 a7adeb6976
			
		
	
	a7adeb6976
	
	
	
		
			
			This patch requests the OCI runtime to create a userns when the CRI message includes such request. Signed-off-by: Rodrigo Campos <rodrigoca@microsoft.com>
		
			
				
	
	
		
			386 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			386 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| /*
 | |
|    Copyright The containerd Authors.
 | |
| 
 | |
|    Licensed under the Apache License, Version 2.0 (the "License");
 | |
|    you may not use this file except in compliance with the License.
 | |
|    You may obtain a copy of the License at
 | |
| 
 | |
|        http://www.apache.org/licenses/LICENSE-2.0
 | |
| 
 | |
|    Unless required by applicable law or agreed to in writing, software
 | |
|    distributed under the License is distributed on an "AS IS" BASIS,
 | |
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
|    See the License for the specific language governing permissions and
 | |
|    limitations under the License.
 | |
| */
 | |
| 
 | |
| package server
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"os"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 
 | |
| 	"github.com/containerd/containerd"
 | |
| 	"github.com/containerd/containerd/oci"
 | |
| 	"github.com/containerd/containerd/plugin"
 | |
| 	"github.com/containerd/containerd/snapshots"
 | |
| 	imagespec "github.com/opencontainers/image-spec/specs-go/v1"
 | |
| 	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
 | |
| 	selinux "github.com/opencontainers/selinux/go-selinux"
 | |
| 	"golang.org/x/sys/unix"
 | |
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
 | |
| 
 | |
| 	"github.com/containerd/containerd/pkg/cri/annotations"
 | |
| 	customopts "github.com/containerd/containerd/pkg/cri/opts"
 | |
| 	osinterface "github.com/containerd/containerd/pkg/os"
 | |
| 	"github.com/containerd/containerd/pkg/userns"
 | |
| )
 | |
| 
 | |
| func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
 | |
| 	imageConfig *imagespec.ImageConfig, nsPath string, runtimePodAnnotations []string) (_ *runtimespec.Spec, retErr error) {
 | |
| 	// Creates a spec Generator with the default spec.
 | |
| 	// TODO(random-liu): [P1] Compare the default settings with docker and containerd default.
 | |
| 	specOpts := []oci.SpecOpts{
 | |
| 		oci.WithoutRunMount,
 | |
| 		customopts.WithoutDefaultSecuritySettings,
 | |
| 		customopts.WithRelativeRoot(relativeRootfsPath),
 | |
| 		oci.WithEnv(imageConfig.Env),
 | |
| 		oci.WithRootFSReadonly(),
 | |
| 		oci.WithHostname(config.GetHostname()),
 | |
| 	}
 | |
| 	if imageConfig.WorkingDir != "" {
 | |
| 		specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
 | |
| 	}
 | |
| 
 | |
| 	if len(imageConfig.Entrypoint) == 0 && len(imageConfig.Cmd) == 0 {
 | |
| 		// Pause image must have entrypoint or cmd.
 | |
| 		return nil, fmt.Errorf("invalid empty entrypoint and cmd in image config %+v", imageConfig)
 | |
| 	}
 | |
| 	specOpts = append(specOpts, oci.WithProcessArgs(append(imageConfig.Entrypoint, imageConfig.Cmd...)...))
 | |
| 
 | |
| 	// Set cgroups parent.
 | |
| 	if c.config.DisableCgroup {
 | |
| 		specOpts = append(specOpts, customopts.WithDisabledCgroups)
 | |
| 	} else {
 | |
| 		if config.GetLinux().GetCgroupParent() != "" {
 | |
| 			cgroupsPath := getCgroupsPath(config.GetLinux().GetCgroupParent(), id)
 | |
| 			specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// When cgroup parent is not set, containerd-shim will create container in a child cgroup
 | |
| 	// of the cgroup itself is in.
 | |
| 	// TODO(random-liu): [P2] Set default cgroup path if cgroup parent is not specified.
 | |
| 
 | |
| 	// Set namespace options.
 | |
| 	var (
 | |
| 		securityContext = config.GetLinux().GetSecurityContext()
 | |
| 		nsOptions       = securityContext.GetNamespaceOptions()
 | |
| 	)
 | |
| 	if nsOptions.GetNetwork() == runtime.NamespaceMode_NODE {
 | |
| 		specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.NetworkNamespace))
 | |
| 		specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UTSNamespace))
 | |
| 	} else {
 | |
| 		specOpts = append(specOpts, oci.WithLinuxNamespace(
 | |
| 			runtimespec.LinuxNamespace{
 | |
| 				Type: runtimespec.NetworkNamespace,
 | |
| 				Path: nsPath,
 | |
| 			}))
 | |
| 	}
 | |
| 	if nsOptions.GetPid() == runtime.NamespaceMode_NODE {
 | |
| 		specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.PIDNamespace))
 | |
| 	}
 | |
| 	if nsOptions.GetIpc() == runtime.NamespaceMode_NODE {
 | |
| 		specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace))
 | |
| 	}
 | |
| 
 | |
| 	usernsOpts := nsOptions.GetUsernsOptions()
 | |
| 	uids, gids, err := parseUsernsIDs(usernsOpts)
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("user namespace configuration: %w", err)
 | |
| 	}
 | |
| 
 | |
| 	if usernsOpts != nil {
 | |
| 		switch mode := usernsOpts.GetMode(); mode {
 | |
| 		case runtime.NamespaceMode_NODE:
 | |
| 			specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UserNamespace))
 | |
| 		case runtime.NamespaceMode_POD:
 | |
| 			specOpts = append(specOpts, oci.WithUserNamespace(uids, gids))
 | |
| 		default:
 | |
| 			return nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// It's fine to generate the spec before the sandbox /dev/shm
 | |
| 	// is actually created.
 | |
| 	sandboxDevShm := c.getSandboxDevShm(id)
 | |
| 	if nsOptions.GetIpc() == runtime.NamespaceMode_NODE {
 | |
| 		sandboxDevShm = devShm
 | |
| 	}
 | |
| 	// Remove the default /dev/shm mount from defaultMounts, it is added in oci/mounts.go.
 | |
| 	specOpts = append(specOpts, oci.WithoutMounts(devShm))
 | |
| 	// In future the when user-namespace is enabled, the `nosuid, nodev, noexec` flags are
 | |
| 	// required, otherwise the remount will fail with EPERM. Just use them unconditionally,
 | |
| 	// they are nice to have anyways.
 | |
| 	specOpts = append(specOpts, oci.WithMounts([]runtimespec.Mount{
 | |
| 		{
 | |
| 			Source:      sandboxDevShm,
 | |
| 			Destination: devShm,
 | |
| 			Type:        "bind",
 | |
| 			Options:     []string{"rbind", "ro", "nosuid", "nodev", "noexec"},
 | |
| 		},
 | |
| 		// Add resolv.conf for katacontainers to setup the DNS of pod VM properly.
 | |
| 		{
 | |
| 			Source:      c.getResolvPath(id),
 | |
| 			Destination: resolvConfPath,
 | |
| 			Type:        "bind",
 | |
| 			Options:     []string{"rbind", "ro"},
 | |
| 		},
 | |
| 	}))
 | |
| 
 | |
| 	processLabel, mountLabel, err := initLabelsFromOpt(securityContext.GetSelinuxOptions())
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
 | |
| 	}
 | |
| 	defer func() {
 | |
| 		if retErr != nil {
 | |
| 			selinux.ReleaseLabel(processLabel)
 | |
| 		}
 | |
| 	}()
 | |
| 
 | |
| 	supplementalGroups := securityContext.GetSupplementalGroups()
 | |
| 	specOpts = append(specOpts,
 | |
| 		customopts.WithSelinuxLabels(processLabel, mountLabel),
 | |
| 		customopts.WithSupplementalGroups(supplementalGroups),
 | |
| 	)
 | |
| 
 | |
| 	// Add sysctls
 | |
| 	sysctls := config.GetLinux().GetSysctls()
 | |
| 	if sysctls == nil {
 | |
| 		sysctls = make(map[string]string)
 | |
| 	}
 | |
| 	_, ipUnprivilegedPortStart := sysctls["net.ipv4.ip_unprivileged_port_start"]
 | |
| 	_, pingGroupRange := sysctls["net.ipv4.ping_group_range"]
 | |
| 	if nsOptions.GetNetwork() != runtime.NamespaceMode_NODE {
 | |
| 		if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart {
 | |
| 			sysctls["net.ipv4.ip_unprivileged_port_start"] = "0"
 | |
| 		}
 | |
| 		if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() {
 | |
| 			sysctls["net.ipv4.ping_group_range"] = "0 2147483647"
 | |
| 		}
 | |
| 	}
 | |
| 	specOpts = append(specOpts, customopts.WithSysctls(sysctls))
 | |
| 
 | |
| 	// Note: LinuxSandboxSecurityContext does not currently provide an apparmor profile
 | |
| 
 | |
| 	if !c.config.DisableCgroup {
 | |
| 		specOpts = append(specOpts, customopts.WithDefaultSandboxShares)
 | |
| 	}
 | |
| 
 | |
| 	if res := config.GetLinux().GetResources(); res != nil {
 | |
| 		specOpts = append(specOpts,
 | |
| 			customopts.WithAnnotation(annotations.SandboxCPUPeriod, strconv.FormatInt(res.CpuPeriod, 10)),
 | |
| 			customopts.WithAnnotation(annotations.SandboxCPUQuota, strconv.FormatInt(res.CpuQuota, 10)),
 | |
| 			customopts.WithAnnotation(annotations.SandboxCPUShares, strconv.FormatInt(res.CpuShares, 10)),
 | |
| 			customopts.WithAnnotation(annotations.SandboxMem, strconv.FormatInt(res.MemoryLimitInBytes, 10)))
 | |
| 	}
 | |
| 
 | |
| 	specOpts = append(specOpts, customopts.WithPodOOMScoreAdj(int(defaultSandboxOOMAdj), c.config.RestrictOOMScoreAdj))
 | |
| 
 | |
| 	for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
 | |
| 		runtimePodAnnotations) {
 | |
| 		specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
 | |
| 	}
 | |
| 
 | |
| 	specOpts = append(specOpts,
 | |
| 		customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeSandbox),
 | |
| 		customopts.WithAnnotation(annotations.SandboxID, id),
 | |
| 		customopts.WithAnnotation(annotations.SandboxNamespace, config.GetMetadata().GetNamespace()),
 | |
| 		customopts.WithAnnotation(annotations.SandboxUID, config.GetMetadata().GetUid()),
 | |
| 		customopts.WithAnnotation(annotations.SandboxName, config.GetMetadata().GetName()),
 | |
| 		customopts.WithAnnotation(annotations.SandboxLogDir, config.GetLogDirectory()),
 | |
| 	)
 | |
| 
 | |
| 	return c.runtimeSpec(id, "", specOpts...)
 | |
| }
 | |
| 
 | |
| // sandboxContainerSpecOpts generates OCI spec options for
 | |
| // the sandbox container.
 | |
| func (c *criService) sandboxContainerSpecOpts(config *runtime.PodSandboxConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
 | |
| 	var (
 | |
| 		securityContext = config.GetLinux().GetSecurityContext()
 | |
| 		specOpts        []oci.SpecOpts
 | |
| 		err             error
 | |
| 	)
 | |
| 	ssp := securityContext.GetSeccomp()
 | |
| 	if ssp == nil {
 | |
| 		ssp, err = generateSeccompSecurityProfile(
 | |
| 			securityContext.GetSeccompProfilePath(), //nolint:staticcheck // Deprecated but we don't want to remove yet
 | |
| 			c.config.UnsetSeccompProfile)
 | |
| 		if err != nil {
 | |
| 			return nil, fmt.Errorf("failed to generate seccomp spec opts: %w", err)
 | |
| 		}
 | |
| 	}
 | |
| 	seccompSpecOpts, err := c.generateSeccompSpecOpts(
 | |
| 		ssp,
 | |
| 		securityContext.GetPrivileged(),
 | |
| 		c.seccompEnabled())
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("failed to generate seccomp spec opts: %w", err)
 | |
| 	}
 | |
| 	if seccompSpecOpts != nil {
 | |
| 		specOpts = append(specOpts, seccompSpecOpts)
 | |
| 	}
 | |
| 
 | |
| 	userstr, err := generateUserString(
 | |
| 		"",
 | |
| 		securityContext.GetRunAsUser(),
 | |
| 		securityContext.GetRunAsGroup(),
 | |
| 	)
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("failed to generate user string: %w", err)
 | |
| 	}
 | |
| 	if userstr == "" {
 | |
| 		// Lastly, since no user override was passed via CRI try to set via OCI
 | |
| 		// Image
 | |
| 		userstr = imageConfig.User
 | |
| 	}
 | |
| 	if userstr != "" {
 | |
| 		specOpts = append(specOpts, oci.WithUser(userstr))
 | |
| 	}
 | |
| 	return specOpts, nil
 | |
| }
 | |
| 
 | |
| // setupSandboxFiles sets up necessary sandbox files including /dev/shm, /etc/hosts,
 | |
| // /etc/resolv.conf and /etc/hostname.
 | |
| func (c *criService) setupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
 | |
| 	sandboxEtcHostname := c.getSandboxHostname(id)
 | |
| 	hostname := config.GetHostname()
 | |
| 	if hostname == "" {
 | |
| 		var err error
 | |
| 		hostname, err = c.os.Hostname()
 | |
| 		if err != nil {
 | |
| 			return fmt.Errorf("failed to get hostname: %w", err)
 | |
| 		}
 | |
| 	}
 | |
| 	if err := c.os.WriteFile(sandboxEtcHostname, []byte(hostname+"\n"), 0644); err != nil {
 | |
| 		return fmt.Errorf("failed to write hostname to %q: %w", sandboxEtcHostname, err)
 | |
| 	}
 | |
| 
 | |
| 	// TODO(random-liu): Consider whether we should maintain /etc/hosts and /etc/resolv.conf in kubelet.
 | |
| 	sandboxEtcHosts := c.getSandboxHosts(id)
 | |
| 	if err := c.os.CopyFile(etcHosts, sandboxEtcHosts, 0644); err != nil {
 | |
| 		return fmt.Errorf("failed to generate sandbox hosts file %q: %w", sandboxEtcHosts, err)
 | |
| 	}
 | |
| 
 | |
| 	// Set DNS options. Maintain a resolv.conf for the sandbox.
 | |
| 	var err error
 | |
| 	resolvContent := ""
 | |
| 	if dnsConfig := config.GetDnsConfig(); dnsConfig != nil {
 | |
| 		resolvContent, err = parseDNSOptions(dnsConfig.Servers, dnsConfig.Searches, dnsConfig.Options)
 | |
| 		if err != nil {
 | |
| 			return fmt.Errorf("failed to parse sandbox DNSConfig %+v: %w", dnsConfig, err)
 | |
| 		}
 | |
| 	}
 | |
| 	resolvPath := c.getResolvPath(id)
 | |
| 	if resolvContent == "" {
 | |
| 		// copy host's resolv.conf to resolvPath
 | |
| 		err = c.os.CopyFile(resolvConfPath, resolvPath, 0644)
 | |
| 		if err != nil {
 | |
| 			return fmt.Errorf("failed to copy host's resolv.conf to %q: %w", resolvPath, err)
 | |
| 		}
 | |
| 	} else {
 | |
| 		err = c.os.WriteFile(resolvPath, []byte(resolvContent), 0644)
 | |
| 		if err != nil {
 | |
| 			return fmt.Errorf("failed to write resolv content to %q: %w", resolvPath, err)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Setup sandbox /dev/shm.
 | |
| 	if config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetIpc() == runtime.NamespaceMode_NODE {
 | |
| 		if _, err := c.os.Stat(devShm); err != nil {
 | |
| 			return fmt.Errorf("host %q is not available for host ipc: %w", devShm, err)
 | |
| 		}
 | |
| 	} else {
 | |
| 		sandboxDevShm := c.getSandboxDevShm(id)
 | |
| 		if err := c.os.MkdirAll(sandboxDevShm, 0700); err != nil {
 | |
| 			return fmt.Errorf("failed to create sandbox shm: %w", err)
 | |
| 		}
 | |
| 		shmproperty := fmt.Sprintf("mode=1777,size=%d", defaultShmSize)
 | |
| 		if err := c.os.(osinterface.UNIX).Mount("shm", sandboxDevShm, "tmpfs", uintptr(unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV), shmproperty); err != nil {
 | |
| 			return fmt.Errorf("failed to mount sandbox shm: %w", err)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // parseDNSOptions parse DNS options into resolv.conf format content,
 | |
| // if none option is specified, will return empty with no error.
 | |
| func parseDNSOptions(servers, searches, options []string) (string, error) {
 | |
| 	resolvContent := ""
 | |
| 
 | |
| 	if len(searches) > 0 {
 | |
| 		resolvContent += fmt.Sprintf("search %s\n", strings.Join(searches, " "))
 | |
| 	}
 | |
| 
 | |
| 	if len(servers) > 0 {
 | |
| 		resolvContent += fmt.Sprintf("nameserver %s\n", strings.Join(servers, "\nnameserver "))
 | |
| 	}
 | |
| 
 | |
| 	if len(options) > 0 {
 | |
| 		resolvContent += fmt.Sprintf("options %s\n", strings.Join(options, " "))
 | |
| 	}
 | |
| 
 | |
| 	return resolvContent, nil
 | |
| }
 | |
| 
 | |
| // cleanupSandboxFiles unmount some sandbox files, we rely on the removal of sandbox root directory to
 | |
| // remove these files. Unmount should *NOT* return error if the mount point is already unmounted.
 | |
| func (c *criService) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
 | |
| 	if config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetIpc() != runtime.NamespaceMode_NODE {
 | |
| 		path, err := c.os.FollowSymlinkInScope(c.getSandboxDevShm(id), "/")
 | |
| 		if err != nil {
 | |
| 			return fmt.Errorf("failed to follow symlink: %w", err)
 | |
| 		}
 | |
| 		if err := c.os.(osinterface.UNIX).Unmount(path); err != nil && !os.IsNotExist(err) {
 | |
| 			return fmt.Errorf("failed to unmount %q: %w", path, err)
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // taskOpts generates task options for a (sandbox) container.
 | |
| func (c *criService) taskOpts(runtimeType string) []containerd.NewTaskOpts {
 | |
| 	// TODO(random-liu): Remove this after shim v1 is deprecated.
 | |
| 	var taskOpts []containerd.NewTaskOpts
 | |
| 
 | |
| 	// c.config.NoPivot is only supported for RuntimeLinuxV1 = "io.containerd.runtime.v1.linux" legacy linux runtime
 | |
| 	// and is not supported for RuntimeRuncV1 = "io.containerd.runc.v1" or  RuntimeRuncV2 = "io.containerd.runc.v2"
 | |
| 	// for RuncV1/2 no pivot is set under the containerd.runtimes.runc.options config see
 | |
| 	// https://github.com/containerd/containerd/blob/v1.3.2/runtime/v2/runc/options/oci.pb.go#L26
 | |
| 	if c.config.NoPivot && runtimeType == plugin.RuntimeLinuxV1 {
 | |
| 		taskOpts = append(taskOpts, containerd.WithNoPivotRoot)
 | |
| 	}
 | |
| 
 | |
| 	return taskOpts
 | |
| }
 | |
| 
 | |
| func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath string) {
 | |
| 	for i := range spec.Linux.Namespaces {
 | |
| 		if spec.Linux.Namespaces[i].Type == runtimespec.NetworkNamespace {
 | |
| 			spec.Linux.Namespaces[i].Path = nsPath
 | |
| 			break
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // sandboxSnapshotterOpts generates any platform specific snapshotter options
 | |
| // for a sandbox container.
 | |
| func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) {
 | |
| 	nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions()
 | |
| 	return snapshotterRemapOpts(nsOpts)
 | |
| }
 |