698 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			698 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
 | 
						|
 | 
						|
package server
 | 
						|
 | 
						|
import (
 | 
						|
	"context"
 | 
						|
	"encoding/json"
 | 
						|
	"errors"
 | 
						|
	"fmt"
 | 
						|
	"math"
 | 
						|
	"path/filepath"
 | 
						|
	"strings"
 | 
						|
	"time"
 | 
						|
 | 
						|
	"github.com/containerd/go-cni"
 | 
						|
	"github.com/containerd/typeurl/v2"
 | 
						|
	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
 | 
						|
 | 
						|
	containerd "github.com/containerd/containerd/v2/client"
 | 
						|
	"github.com/containerd/containerd/v2/pkg/cri/annotations"
 | 
						|
	"github.com/containerd/containerd/v2/pkg/cri/bandwidth"
 | 
						|
	criconfig "github.com/containerd/containerd/v2/pkg/cri/config"
 | 
						|
	"github.com/containerd/containerd/v2/pkg/cri/server/podsandbox"
 | 
						|
	sandboxstore "github.com/containerd/containerd/v2/pkg/cri/store/sandbox"
 | 
						|
	"github.com/containerd/containerd/v2/pkg/cri/util"
 | 
						|
	"github.com/containerd/containerd/v2/pkg/netns"
 | 
						|
	sb "github.com/containerd/containerd/v2/sandbox"
 | 
						|
	"github.com/containerd/log"
 | 
						|
)
 | 
						|
 | 
						|
func init() {
 | 
						|
	typeurl.Register(&sandboxstore.Metadata{},
 | 
						|
		"github.com/containerd/cri/pkg/store/sandbox", "Metadata")
 | 
						|
}
 | 
						|
 | 
						|
// RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure
 | 
						|
// the sandbox is in ready state.
 | 
						|
func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
 | 
						|
	config := r.GetConfig()
 | 
						|
	log.G(ctx).Debugf("Sandbox config %+v", config)
 | 
						|
 | 
						|
	// Generate unique id and name for the sandbox and reserve the name.
 | 
						|
	id := util.GenerateID()
 | 
						|
	metadata := config.GetMetadata()
 | 
						|
	if metadata == nil {
 | 
						|
		return nil, errors.New("sandbox config must include metadata")
 | 
						|
	}
 | 
						|
	name := makeSandboxName(metadata)
 | 
						|
	log.G(ctx).WithField("podsandboxid", id).Debugf("generated id for sandbox name %q", name)
 | 
						|
 | 
						|
	// cleanupErr records the last error returned by the critical cleanup operations in deferred functions,
 | 
						|
	// like CNI teardown and stopping the running sandbox task.
 | 
						|
	// If cleanup is not completed for some reason, the CRI-plugin will leave the sandbox
 | 
						|
	// in a not-ready state, which can later be cleaned up by the next execution of the kubelet's syncPod workflow.
 | 
						|
	var cleanupErr error
 | 
						|
 | 
						|
	// Reserve the sandbox name to avoid concurrent `RunPodSandbox` request starting the
 | 
						|
	// same sandbox.
 | 
						|
	if err := c.sandboxNameIndex.Reserve(name, id); err != nil {
 | 
						|
		return nil, fmt.Errorf("failed to reserve sandbox name %q: %w", name, err)
 | 
						|
	}
 | 
						|
	defer func() {
 | 
						|
		// Release the name if the function returns with an error.
 | 
						|
		// When cleanupErr != nil, the name will be cleaned in sandbox_remove.
 | 
						|
		if retErr != nil && cleanupErr == nil {
 | 
						|
			c.sandboxNameIndex.ReleaseByName(name)
 | 
						|
		}
 | 
						|
	}()
 | 
						|
 | 
						|
	var (
 | 
						|
		err         error
 | 
						|
		sandboxInfo = sb.Sandbox{ID: id}
 | 
						|
	)
 | 
						|
 | 
						|
	ociRuntime, err := c.getSandboxRuntime(config, r.GetRuntimeHandler())
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("unable to get OCI runtime for sandbox %q: %w", id, err)
 | 
						|
	}
 | 
						|
 | 
						|
	sandboxInfo.Runtime.Name = ociRuntime.Type
 | 
						|
	sandboxInfo.Sandboxer = ociRuntime.Sandboxer
 | 
						|
 | 
						|
	runtimeStart := time.Now()
 | 
						|
	// Retrieve runtime options
 | 
						|
	runtimeOpts, err := generateRuntimeOptions(ociRuntime)
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("failed to generate sandbox runtime options: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	if runtimeOpts != nil {
 | 
						|
		sandboxInfo.Runtime.Options, err = typeurl.MarshalAny(runtimeOpts)
 | 
						|
		if err != nil {
 | 
						|
			return nil, fmt.Errorf("failed to marshal runtime options: %w", err)
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	// Save sandbox name
 | 
						|
	sandboxInfo.AddLabel("name", name)
 | 
						|
 | 
						|
	// Create initial internal sandbox object.
 | 
						|
	sandbox := sandboxstore.NewSandbox(
 | 
						|
		sandboxstore.Metadata{
 | 
						|
			ID:             id,
 | 
						|
			Name:           name,
 | 
						|
			Config:         config,
 | 
						|
			RuntimeHandler: r.GetRuntimeHandler(),
 | 
						|
		},
 | 
						|
		sandboxstore.Status{
 | 
						|
			State: sandboxstore.StateUnknown,
 | 
						|
		},
 | 
						|
	)
 | 
						|
 | 
						|
	if _, err := c.client.SandboxStore().Create(ctx, sandboxInfo); err != nil {
 | 
						|
		return nil, fmt.Errorf("failed to save sandbox metadata: %w", err)
 | 
						|
	}
 | 
						|
	defer func() {
 | 
						|
		if retErr != nil && cleanupErr == nil {
 | 
						|
			cleanupErr = c.client.SandboxStore().Delete(ctx, id)
 | 
						|
		}
 | 
						|
	}()
 | 
						|
 | 
						|
	defer func() {
 | 
						|
		// Put the sandbox into sandbox store when some resources fail to be cleaned.
 | 
						|
		if retErr != nil && cleanupErr != nil {
 | 
						|
			log.G(ctx).WithError(cleanupErr).Errorf("encountered an error cleaning up failed sandbox %q, marking sandbox state as SANDBOX_UNKNOWN", id)
 | 
						|
			if err := c.sandboxStore.Add(sandbox); err != nil {
 | 
						|
				log.G(ctx).WithError(err).Errorf("failed to add sandbox %+v into store", sandbox)
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}()
 | 
						|
 | 
						|
	// XXX: What we really want here is to call controller.Platform() and then check
 | 
						|
	// platform.OS, but that is only populated after controller.Create() and that needs to be
 | 
						|
	// done later (uses sandbox.NSPath that we will set just _after_ this).
 | 
						|
	// So, lets check for the Linux section on the config, if that is populated, we assume the
 | 
						|
	// platform is linux.
 | 
						|
	// This is a hack, we should improve the controller interface to return the platform
 | 
						|
	// earlier. But should work fine for this specific use.
 | 
						|
	userNsEnabled := false
 | 
						|
	if linux := config.GetLinux(); linux != nil {
 | 
						|
		usernsOpts := linux.GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
 | 
						|
		if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD {
 | 
						|
			userNsEnabled = true
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	// Setup the network namespace if host networking wasn't requested.
 | 
						|
	if !hostNetwork(config) && !userNsEnabled {
 | 
						|
		// XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too.
 | 
						|
		// We can't move this to a function, as the defer calls need to be executed if other
 | 
						|
		// errors are returned in this function. So, we would need more refactors to move
 | 
						|
		// this code to a function and the idea was to not change the current code for
 | 
						|
		// !userNsEnabled case, therefore doing it would defeat the purpose.
 | 
						|
		//
 | 
						|
		// The difference between the cases is the use of netns.NewNetNS() vs
 | 
						|
		// netns.NewNetNSFromPID().
 | 
						|
		//
 | 
						|
		// To simplify this, in the future, we should just remove this case (podNetwork &&
 | 
						|
		// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled).
 | 
						|
		netStart := time.Now()
 | 
						|
		// If it is not in host network namespace then create a namespace and set the sandbox
 | 
						|
		// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
 | 
						|
		// namespaces. If the pod is in host network namespace then both are empty and should not
 | 
						|
		// be used.
 | 
						|
		var netnsMountDir = "/var/run/netns"
 | 
						|
		if c.config.NetNSMountsUnderStateDir {
 | 
						|
			netnsMountDir = filepath.Join(c.config.StateDir, "netns")
 | 
						|
		}
 | 
						|
		sandbox.NetNS, err = netns.NewNetNS(netnsMountDir)
 | 
						|
		if err != nil {
 | 
						|
			return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
 | 
						|
		}
 | 
						|
		// Update network namespace in the store, which is used to generate the container's spec
 | 
						|
		sandbox.NetNSPath = sandbox.NetNS.GetPath()
 | 
						|
		defer func() {
 | 
						|
			// Remove the network namespace only if all the resource cleanup is done
 | 
						|
			if retErr != nil && cleanupErr == nil {
 | 
						|
				if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
 | 
						|
					log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
 | 
						|
					return
 | 
						|
				}
 | 
						|
				sandbox.NetNSPath = ""
 | 
						|
			}
 | 
						|
		}()
 | 
						|
 | 
						|
		if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
 | 
						|
			return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
 | 
						|
		}
 | 
						|
		// Save sandbox metadata to store
 | 
						|
		if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
 | 
						|
			return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
 | 
						|
		}
 | 
						|
 | 
						|
		// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
 | 
						|
		// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource
 | 
						|
		// creation functions.
 | 
						|
		defer func() {
 | 
						|
			// Remove the network namespace only if all the resource cleanup is done.
 | 
						|
			if retErr != nil && cleanupErr == nil {
 | 
						|
				deferCtx, deferCancel := util.DeferContext()
 | 
						|
				defer deferCancel()
 | 
						|
				// Teardown network if an error is returned.
 | 
						|
				if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
 | 
						|
					log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
 | 
						|
				}
 | 
						|
 | 
						|
			}
 | 
						|
		}()
 | 
						|
 | 
						|
		// Setup network for sandbox.
 | 
						|
		// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
 | 
						|
		// rely on the assumption that CRI shim will not be querying the network namespace to check the
 | 
						|
		// network states such as IP.
 | 
						|
		// In future runtime implementation should avoid relying on CRI shim implementation details.
 | 
						|
		// In this case however caching the IP will add a subtle performance enhancement by avoiding
 | 
						|
		// calls to network namespace of the pod to query the IP of the veth interface on every
 | 
						|
		// SandboxStatus request.
 | 
						|
		if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
 | 
						|
			return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
 | 
						|
		}
 | 
						|
		sandboxCreateNetworkTimer.UpdateSince(netStart)
 | 
						|
	}
 | 
						|
 | 
						|
	if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
 | 
						|
		return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
 | 
						|
	}
 | 
						|
 | 
						|
	controller, err := c.getSandboxController(config, r.GetRuntimeHandler())
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("failed to get sandbox controller: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	// Save sandbox metadata to store
 | 
						|
	if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
 | 
						|
		return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
 | 
						|
	}
 | 
						|
 | 
						|
	if err := controller.Create(ctx, sandboxInfo, sb.WithOptions(config), sb.WithNetNSPath(sandbox.NetNSPath)); err != nil {
 | 
						|
		return nil, fmt.Errorf("failed to create sandbox %q: %w", id, err)
 | 
						|
	}
 | 
						|
 | 
						|
	ctrl, err := controller.Start(ctx, id)
 | 
						|
	if err != nil {
 | 
						|
		sandbox.Container, _ = c.client.LoadContainer(ctx, id)
 | 
						|
		var cerr podsandbox.CleanupErr
 | 
						|
		if errors.As(err, &cerr) {
 | 
						|
			cleanupErr = fmt.Errorf("failed to cleanup sandbox: %w", cerr)
 | 
						|
 | 
						|
			// Strip last error as cleanup error to handle separately
 | 
						|
			if merr, ok := err.(interface{ Unwrap() []error }); ok {
 | 
						|
				if errs := merr.Unwrap(); len(errs) > 0 {
 | 
						|
					err = errs[0]
 | 
						|
				}
 | 
						|
			}
 | 
						|
		}
 | 
						|
		return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err)
 | 
						|
	}
 | 
						|
 | 
						|
	if !hostNetwork(config) && userNsEnabled {
 | 
						|
		// If userns is enabled, then the netns was created by the OCI runtime
 | 
						|
		// on controller.Start(). The OCI runtime needs to create the netns
 | 
						|
		// because, if userns is in use, the netns needs to be owned by the
 | 
						|
		// userns. So, let the OCI runtime just handle this for us.
 | 
						|
		// If the netns is not owned by the userns several problems will happen.
 | 
						|
		// For instance, the container will lack permission (even if
 | 
						|
		// capabilities are present) to modify the netns or, even worse, the OCI
 | 
						|
		// runtime will fail to mount sysfs:
 | 
						|
		//      https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164
 | 
						|
		//
 | 
						|
		// Note we do this after controller.Start(), as before that we
 | 
						|
		// can't get the PID for the sandbox that we need for the netns.
 | 
						|
		// Doing a controller.Status() call before that fails (can't
 | 
						|
		// find the sandbox) so we can't get the PID.
 | 
						|
		netStart := time.Now()
 | 
						|
 | 
						|
		// If it is not in host network namespace then create a namespace and set the sandbox
 | 
						|
		// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
 | 
						|
		// namespaces. If the pod is in host network namespace then both are empty and should not
 | 
						|
		// be used.
 | 
						|
		var netnsMountDir = "/var/run/netns"
 | 
						|
		if c.config.NetNSMountsUnderStateDir {
 | 
						|
			netnsMountDir = filepath.Join(c.config.StateDir, "netns")
 | 
						|
		}
 | 
						|
 | 
						|
		sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid)
 | 
						|
		if err != nil {
 | 
						|
			return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
 | 
						|
		}
 | 
						|
 | 
						|
		// Update network namespace in the store, which is used to generate the container's spec
 | 
						|
		sandbox.NetNSPath = sandbox.NetNS.GetPath()
 | 
						|
		defer func() {
 | 
						|
			// Remove the network namespace only if all the resource cleanup is done
 | 
						|
			if retErr != nil && cleanupErr == nil {
 | 
						|
				if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
 | 
						|
					log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
 | 
						|
					return
 | 
						|
				}
 | 
						|
				sandbox.NetNSPath = ""
 | 
						|
			}
 | 
						|
		}()
 | 
						|
 | 
						|
		if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
 | 
						|
			return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
 | 
						|
		}
 | 
						|
		// Save sandbox metadata to store
 | 
						|
		if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
 | 
						|
			return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
 | 
						|
		}
 | 
						|
 | 
						|
		// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
 | 
						|
		// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource
 | 
						|
		// creation functions.
 | 
						|
		defer func() {
 | 
						|
			// Remove the network namespace only if all the resource cleanup is done.
 | 
						|
			if retErr != nil && cleanupErr == nil {
 | 
						|
				deferCtx, deferCancel := util.DeferContext()
 | 
						|
				defer deferCancel()
 | 
						|
				// Teardown network if an error is returned.
 | 
						|
				if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
 | 
						|
					log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
 | 
						|
				}
 | 
						|
 | 
						|
			}
 | 
						|
		}()
 | 
						|
 | 
						|
		// Setup network for sandbox.
 | 
						|
		// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
 | 
						|
		// rely on the assumption that CRI shim will not be querying the network namespace to check the
 | 
						|
		// network states such as IP.
 | 
						|
		// In future runtime implementation should avoid relying on CRI shim implementation details.
 | 
						|
		// In this case however caching the IP will add a subtle performance enhancement by avoiding
 | 
						|
		// calls to network namespace of the pod to query the IP of the veth interface on every
 | 
						|
		// SandboxStatus request.
 | 
						|
		if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
 | 
						|
			return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
 | 
						|
		}
 | 
						|
		sandboxCreateNetworkTimer.UpdateSince(netStart)
 | 
						|
	}
 | 
						|
 | 
						|
	// TODO: get rid of this. sandbox object should no longer have Container field.
 | 
						|
	if ociRuntime.Sandboxer == string(criconfig.ModePodSandbox) {
 | 
						|
		container, err := c.client.LoadContainer(ctx, id)
 | 
						|
		if err != nil {
 | 
						|
			return nil, fmt.Errorf("failed to load container %q for sandbox: %w", id, err)
 | 
						|
		}
 | 
						|
		sandbox.Container = container
 | 
						|
	}
 | 
						|
 | 
						|
	labels := ctrl.Labels
 | 
						|
	if labels == nil {
 | 
						|
		labels = map[string]string{}
 | 
						|
	}
 | 
						|
 | 
						|
	sandbox.ProcessLabel = labels["selinux_label"]
 | 
						|
 | 
						|
	err = c.nri.RunPodSandbox(ctx, &sandbox)
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("NRI RunPodSandbox failed: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	defer func() {
 | 
						|
		if retErr != nil {
 | 
						|
			deferCtx, deferCancel := util.DeferContext()
 | 
						|
			defer deferCancel()
 | 
						|
			c.nri.RemovePodSandbox(deferCtx, &sandbox)
 | 
						|
		}
 | 
						|
	}()
 | 
						|
 | 
						|
	if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
 | 
						|
		// Set the pod sandbox as ready after successfully start sandbox container.
 | 
						|
		status.Pid = ctrl.Pid
 | 
						|
		status.State = sandboxstore.StateReady
 | 
						|
		status.CreatedAt = ctrl.CreatedAt
 | 
						|
		return status, nil
 | 
						|
	}); err != nil {
 | 
						|
		return nil, fmt.Errorf("failed to update sandbox status: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	// Add sandbox into sandbox store in INIT state.
 | 
						|
	if err := c.sandboxStore.Add(sandbox); err != nil {
 | 
						|
		return nil, fmt.Errorf("failed to add sandbox %+v into store: %w", sandbox, err)
 | 
						|
	}
 | 
						|
 | 
						|
	// Send CONTAINER_CREATED event with both ContainerId and SandboxId equal to SandboxId.
 | 
						|
	// Note that this has to be done after sandboxStore.Add() because we need to get
 | 
						|
	// SandboxStatus from the store and include it in the event.
 | 
						|
	c.generateAndSendContainerEvent(ctx, id, id, runtime.ContainerEventType_CONTAINER_CREATED_EVENT)
 | 
						|
 | 
						|
	// TODO: Use sandbox client instead
 | 
						|
	exitCh := make(chan containerd.ExitStatus, 1)
 | 
						|
	go func() {
 | 
						|
		defer close(exitCh)
 | 
						|
 | 
						|
		ctx := util.NamespacedContext()
 | 
						|
		resp, err := controller.Wait(ctx, id)
 | 
						|
		if err != nil {
 | 
						|
			log.G(ctx).WithError(err).Error("failed to wait for sandbox exit")
 | 
						|
			exitCh <- *containerd.NewExitStatus(containerd.UnknownExitStatus, time.Time{}, err)
 | 
						|
			return
 | 
						|
		}
 | 
						|
 | 
						|
		exitCh <- *containerd.NewExitStatus(resp.ExitStatus, resp.ExitedAt, nil)
 | 
						|
	}()
 | 
						|
 | 
						|
	// start the monitor after adding sandbox into the store, this ensures
 | 
						|
	// that sandbox is in the store, when event monitor receives the TaskExit event.
 | 
						|
	//
 | 
						|
	// TaskOOM from containerd may come before sandbox is added to store,
 | 
						|
	// but we don't care about sandbox TaskOOM right now, so it is fine.
 | 
						|
	c.eventMonitor.startSandboxExitMonitor(context.Background(), id, ctrl.Pid, exitCh)
 | 
						|
 | 
						|
	// Send CONTAINER_STARTED event with ContainerId equal to SandboxId.
 | 
						|
	c.generateAndSendContainerEvent(ctx, id, id, runtime.ContainerEventType_CONTAINER_STARTED_EVENT)
 | 
						|
 | 
						|
	sandboxRuntimeCreateTimer.WithValues(labels["oci_runtime_type"]).UpdateSince(runtimeStart)
 | 
						|
 | 
						|
	return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil
 | 
						|
}
 | 
						|
 | 
						|
// getNetworkPlugin returns the network plugin to be used by the runtime class
 | 
						|
// defaults to the global CNI options in the CRI config
 | 
						|
func (c *criService) getNetworkPlugin(runtimeClass string) cni.CNI {
 | 
						|
	if c.netPlugin == nil {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	i, ok := c.netPlugin[runtimeClass]
 | 
						|
	if !ok {
 | 
						|
		if i, ok = c.netPlugin[defaultNetworkPlugin]; !ok {
 | 
						|
			return nil
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return i
 | 
						|
}
 | 
						|
 | 
						|
// setupPodNetwork setups up the network for a pod
 | 
						|
func (c *criService) setupPodNetwork(ctx context.Context, sandbox *sandboxstore.Sandbox) error {
 | 
						|
	var (
 | 
						|
		id        = sandbox.ID
 | 
						|
		config    = sandbox.Config
 | 
						|
		path      = sandbox.NetNSPath
 | 
						|
		netPlugin = c.getNetworkPlugin(sandbox.RuntimeHandler)
 | 
						|
		err       error
 | 
						|
		result    *cni.Result
 | 
						|
	)
 | 
						|
	if netPlugin == nil {
 | 
						|
		return errors.New("cni config not initialized")
 | 
						|
	}
 | 
						|
 | 
						|
	opts, err := cniNamespaceOpts(id, config)
 | 
						|
	if err != nil {
 | 
						|
		return fmt.Errorf("get cni namespace options: %w", err)
 | 
						|
	}
 | 
						|
	log.G(ctx).WithField("podsandboxid", id).Debugf("begin cni setup")
 | 
						|
	netStart := time.Now()
 | 
						|
	if c.config.CniConfig.NetworkPluginSetupSerially {
 | 
						|
		result, err = netPlugin.SetupSerially(ctx, id, path, opts...)
 | 
						|
	} else {
 | 
						|
		result, err = netPlugin.Setup(ctx, id, path, opts...)
 | 
						|
	}
 | 
						|
	networkPluginOperations.WithValues(networkSetUpOp).Inc()
 | 
						|
	networkPluginOperationsLatency.WithValues(networkSetUpOp).UpdateSince(netStart)
 | 
						|
	if err != nil {
 | 
						|
		networkPluginOperationsErrors.WithValues(networkSetUpOp).Inc()
 | 
						|
		return err
 | 
						|
	}
 | 
						|
	logDebugCNIResult(ctx, id, result)
 | 
						|
	// Check if the default interface has IP config
 | 
						|
	if configs, ok := result.Interfaces[defaultIfName]; ok && len(configs.IPConfigs) > 0 {
 | 
						|
		sandbox.IP, sandbox.AdditionalIPs = selectPodIPs(ctx, configs.IPConfigs, c.config.IPPreference)
 | 
						|
		sandbox.CNIResult = result
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	return fmt.Errorf("failed to find network info for sandbox %q", id)
 | 
						|
}
 | 
						|
 | 
						|
// cniNamespaceOpts get CNI namespace options from sandbox config.
 | 
						|
func cniNamespaceOpts(id string, config *runtime.PodSandboxConfig) ([]cni.NamespaceOpts, error) {
 | 
						|
	opts := []cni.NamespaceOpts{
 | 
						|
		cni.WithLabels(toCNILabels(id, config)),
 | 
						|
		cni.WithCapability(annotations.PodAnnotations, config.Annotations),
 | 
						|
	}
 | 
						|
 | 
						|
	portMappings := toCNIPortMappings(config.GetPortMappings())
 | 
						|
	if len(portMappings) > 0 {
 | 
						|
		opts = append(opts, cni.WithCapabilityPortMap(portMappings))
 | 
						|
	}
 | 
						|
 | 
						|
	// Will return an error if the bandwidth limitation has the wrong unit
 | 
						|
	// or an unreasonable value see validateBandwidthIsReasonable()
 | 
						|
	bandWidth, err := toCNIBandWidth(config.Annotations)
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	if bandWidth != nil {
 | 
						|
		opts = append(opts, cni.WithCapabilityBandWidth(*bandWidth))
 | 
						|
	}
 | 
						|
 | 
						|
	dns := toCNIDNS(config.GetDnsConfig())
 | 
						|
	if dns != nil {
 | 
						|
		opts = append(opts, cni.WithCapabilityDNS(*dns))
 | 
						|
	}
 | 
						|
 | 
						|
	if cgroup := config.GetLinux().GetCgroupParent(); cgroup != "" {
 | 
						|
		opts = append(opts, cni.WithCapabilityCgroupPath(cgroup))
 | 
						|
	}
 | 
						|
 | 
						|
	return opts, nil
 | 
						|
}
 | 
						|
 | 
						|
// toCNILabels adds pod metadata into CNI labels.
 | 
						|
func toCNILabels(id string, config *runtime.PodSandboxConfig) map[string]string {
 | 
						|
	return map[string]string{
 | 
						|
		"K8S_POD_NAMESPACE":          config.GetMetadata().GetNamespace(),
 | 
						|
		"K8S_POD_NAME":               config.GetMetadata().GetName(),
 | 
						|
		"K8S_POD_INFRA_CONTAINER_ID": id,
 | 
						|
		"K8S_POD_UID":                config.GetMetadata().GetUid(),
 | 
						|
		"IgnoreUnknown":              "1",
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// toCNIBandWidth converts CRI annotations to CNI bandwidth.
 | 
						|
func toCNIBandWidth(annotations map[string]string) (*cni.BandWidth, error) {
 | 
						|
	ingress, egress, err := bandwidth.ExtractPodBandwidthResources(annotations)
 | 
						|
	if err != nil {
 | 
						|
		return nil, fmt.Errorf("reading pod bandwidth annotations: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	if ingress == nil && egress == nil {
 | 
						|
		return nil, nil
 | 
						|
	}
 | 
						|
 | 
						|
	bandWidth := &cni.BandWidth{}
 | 
						|
 | 
						|
	if ingress != nil {
 | 
						|
		bandWidth.IngressRate = uint64(ingress.Value())
 | 
						|
		bandWidth.IngressBurst = math.MaxUint32
 | 
						|
	}
 | 
						|
 | 
						|
	if egress != nil {
 | 
						|
		bandWidth.EgressRate = uint64(egress.Value())
 | 
						|
		bandWidth.EgressBurst = math.MaxUint32
 | 
						|
	}
 | 
						|
 | 
						|
	return bandWidth, nil
 | 
						|
}
 | 
						|
 | 
						|
// toCNIPortMappings converts CRI port mappings to CNI.
 | 
						|
func toCNIPortMappings(criPortMappings []*runtime.PortMapping) []cni.PortMapping {
 | 
						|
	var portMappings []cni.PortMapping
 | 
						|
	for _, mapping := range criPortMappings {
 | 
						|
		if mapping.HostPort <= 0 {
 | 
						|
			continue
 | 
						|
		}
 | 
						|
		portMappings = append(portMappings, cni.PortMapping{
 | 
						|
			HostPort:      mapping.HostPort,
 | 
						|
			ContainerPort: mapping.ContainerPort,
 | 
						|
			Protocol:      strings.ToLower(mapping.Protocol.String()),
 | 
						|
			HostIP:        mapping.HostIp,
 | 
						|
		})
 | 
						|
	}
 | 
						|
	return portMappings
 | 
						|
}
 | 
						|
 | 
						|
// toCNIDNS converts CRI DNSConfig to CNI.
 | 
						|
func toCNIDNS(dns *runtime.DNSConfig) *cni.DNS {
 | 
						|
	if dns == nil {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	return &cni.DNS{
 | 
						|
		Servers:  dns.GetServers(),
 | 
						|
		Searches: dns.GetSearches(),
 | 
						|
		Options:  dns.GetOptions(),
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// selectPodIPs select an ip from the ip list.
 | 
						|
func selectPodIPs(ctx context.Context, configs []*cni.IPConfig, preference string) (string, []string) {
 | 
						|
	if len(configs) == 1 {
 | 
						|
		return ipString(configs[0]), nil
 | 
						|
	}
 | 
						|
	toStrings := func(ips []*cni.IPConfig) (o []string) {
 | 
						|
		for _, i := range ips {
 | 
						|
			o = append(o, ipString(i))
 | 
						|
		}
 | 
						|
		return o
 | 
						|
	}
 | 
						|
	var extra []string
 | 
						|
	switch preference {
 | 
						|
	default:
 | 
						|
		if preference != "ipv4" && preference != "" {
 | 
						|
			log.G(ctx).WithField("ip_pref", preference).Warn("invalid ip_pref, falling back to ipv4")
 | 
						|
		}
 | 
						|
		for i, ip := range configs {
 | 
						|
			if ip.IP.To4() != nil {
 | 
						|
				return ipString(ip), append(extra, toStrings(configs[i+1:])...)
 | 
						|
			}
 | 
						|
			extra = append(extra, ipString(ip))
 | 
						|
		}
 | 
						|
	case "ipv6":
 | 
						|
		for i, ip := range configs {
 | 
						|
			if ip.IP.To4() == nil {
 | 
						|
				return ipString(ip), append(extra, toStrings(configs[i+1:])...)
 | 
						|
			}
 | 
						|
			extra = append(extra, ipString(ip))
 | 
						|
		}
 | 
						|
	case "cni":
 | 
						|
		// use func default return
 | 
						|
	}
 | 
						|
 | 
						|
	all := toStrings(configs)
 | 
						|
	return all[0], all[1:]
 | 
						|
}
 | 
						|
 | 
						|
func ipString(ip *cni.IPConfig) string {
 | 
						|
	return ip.IP.String()
 | 
						|
}
 | 
						|
 | 
						|
// untrustedWorkload returns true if the sandbox contains untrusted workload.
 | 
						|
func untrustedWorkload(config *runtime.PodSandboxConfig) bool {
 | 
						|
	return config.GetAnnotations()[annotations.UntrustedWorkload] == "true"
 | 
						|
}
 | 
						|
 | 
						|
// hostAccessingSandbox returns true if the sandbox configuration
 | 
						|
// requires additional host access for the sandbox.
 | 
						|
func hostAccessingSandbox(config *runtime.PodSandboxConfig) bool {
 | 
						|
	securityContext := config.GetLinux().GetSecurityContext()
 | 
						|
 | 
						|
	namespaceOptions := securityContext.GetNamespaceOptions()
 | 
						|
	if namespaceOptions.GetNetwork() == runtime.NamespaceMode_NODE ||
 | 
						|
		namespaceOptions.GetPid() == runtime.NamespaceMode_NODE ||
 | 
						|
		namespaceOptions.GetIpc() == runtime.NamespaceMode_NODE {
 | 
						|
		return true
 | 
						|
	}
 | 
						|
 | 
						|
	return false
 | 
						|
}
 | 
						|
 | 
						|
// getSandboxRuntime returns the runtime configuration for sandbox.
 | 
						|
// If the sandbox contains untrusted workload, runtime for untrusted workload will be returned,
 | 
						|
// or else default runtime will be returned.
 | 
						|
func (c *criService) getSandboxRuntime(config *runtime.PodSandboxConfig, runtimeHandler string) (criconfig.Runtime, error) {
 | 
						|
	if untrustedWorkload(config) {
 | 
						|
		// If the untrusted annotation is provided, runtimeHandler MUST be empty.
 | 
						|
		if runtimeHandler != "" && runtimeHandler != criconfig.RuntimeUntrusted {
 | 
						|
			return criconfig.Runtime{}, errors.New("untrusted workload with explicit runtime handler is not allowed")
 | 
						|
		}
 | 
						|
 | 
						|
		//  If the untrusted workload is requesting access to the host/node, this request will fail.
 | 
						|
		//
 | 
						|
		//  Note: If the workload is marked untrusted but requests privileged, this can be granted, as the
 | 
						|
		// runtime may support this.  For example, in a virtual-machine isolated runtime, privileged
 | 
						|
		// is a supported option, granting the workload to access the entire guest VM instead of host.
 | 
						|
		// TODO(windows): Deprecate this so that we don't need to handle it for windows.
 | 
						|
		if hostAccessingSandbox(config) {
 | 
						|
			return criconfig.Runtime{}, errors.New("untrusted workload with host access is not allowed")
 | 
						|
		}
 | 
						|
 | 
						|
		runtimeHandler = criconfig.RuntimeUntrusted
 | 
						|
	}
 | 
						|
 | 
						|
	if runtimeHandler == "" {
 | 
						|
		runtimeHandler = c.config.ContainerdConfig.DefaultRuntimeName
 | 
						|
	}
 | 
						|
 | 
						|
	handler, ok := c.config.ContainerdConfig.Runtimes[runtimeHandler]
 | 
						|
	if !ok {
 | 
						|
		return criconfig.Runtime{}, fmt.Errorf("no runtime for %q is configured", runtimeHandler)
 | 
						|
	}
 | 
						|
	return handler, nil
 | 
						|
}
 | 
						|
 | 
						|
func logDebugCNIResult(ctx context.Context, sandboxID string, result *cni.Result) {
 | 
						|
	if log.GetLevel() < log.DebugLevel {
 | 
						|
		return
 | 
						|
	}
 | 
						|
	cniResult, err := json.Marshal(result)
 | 
						|
	if err != nil {
 | 
						|
		log.G(ctx).WithField("podsandboxid", sandboxID).WithError(err).Errorf("Failed to marshal CNI result: %v", err)
 | 
						|
		return
 | 
						|
	}
 | 
						|
	log.G(ctx).WithField("podsandboxid", sandboxID).Debugf("cni result: %s", string(cniResult))
 | 
						|
}
 |