diff --git a/pkg/cri/sbserver/container_create.go b/pkg/cri/sbserver/container_create.go index cb3191506..eb1e85f2b 100644 --- a/pkg/cri/sbserver/container_create.go +++ b/pkg/cri/sbserver/container_create.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "path/filepath" + "strconv" "time" "github.com/containerd/containerd" @@ -28,6 +29,7 @@ import ( "github.com/containerd/containerd/containers" "github.com/containerd/containerd/log" "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/pkg/cri/annotations" "github.com/containerd/containerd/pkg/cri/config" criconfig "github.com/containerd/containerd/pkg/cri/config" cio "github.com/containerd/containerd/pkg/cri/io" @@ -394,6 +396,13 @@ func (c *criService) runtimeSnapshotter(ctx context.Context, ociRuntime criconfi return ociRuntime.Snapshotter } +const ( + // relativeRootfsPath is the rootfs path relative to bundle path. + relativeRootfsPath = "rootfs" + // hostnameEnv is the key for HOSTNAME env. + hostnameEnv = "HOSTNAME" +) + // buildContainerSpec build container's OCI spec depending on controller's target platform OS. func (c *criService) buildContainerSpec( platform *types.Platform, @@ -410,8 +419,49 @@ func (c *criService) buildContainerSpec( ociRuntime config.Runtime, ) (_ *runtimespec.Spec, retErr error) { var ( - specOpts []oci.SpecOpts - isLinux = platform.OS == "linux" + specOpts []oci.SpecOpts + isLinux = platform.OS == "linux" + isWindows = platform.OS == "windows" + ) + + specOpts = append(specOpts, customopts.WithProcessArgs(config, imageConfig)) + + if config.GetWorkingDir() != "" { + specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir())) + } else if imageConfig.WorkingDir != "" { + specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir)) + } + + if config.GetTty() { + specOpts = append(specOpts, oci.WithTTY) + } + + // Apply envs from image config first, so that envs from container config + // can override them. + env := append([]string{}, imageConfig.Env...) + for _, e := range config.GetEnvs() { + env = append(env, e.GetKey()+"="+e.GetValue()) + } + specOpts = append(specOpts, oci.WithEnv(env)) + + for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations, + ociRuntime.PodAnnotations) { + specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) + } + + for pKey, pValue := range getPassthroughAnnotations(config.Annotations, + ociRuntime.ContainerAnnotations) { + specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) + } + + specOpts = append(specOpts, + customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), + customopts.WithAnnotation(annotations.SandboxID, sandboxID), + customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()), + customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()), + customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()), + customopts.WithAnnotation(annotations.ContainerName, containerName), + customopts.WithAnnotation(annotations.ImageName, imageName), ) if isLinux { @@ -423,6 +473,117 @@ func (c *criService) buildContainerSpec( if ociRuntime.BaseRuntimeSpec == "" { specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings) } + + specOpts = append(specOpts, + customopts.WithRelativeRoot(relativeRootfsPath), + oci.WithDefaultPathEnv, + // this will be set based on the security context below + oci.WithNewPrivileges, + ) + + // Add HOSTNAME env. + var ( + err error + hostname = sandboxConfig.GetHostname() + ) + if hostname == "" { + if hostname, err = c.os.Hostname(); err != nil { + return nil, err + } + } + specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname})) + + securityContext := config.GetLinux().GetSecurityContext() + + // Clear all ambient capabilities. The implication of non-root + caps + // is not clearly defined in Kubernetes. + // See https://github.com/kubernetes/kubernetes/issues/56374 + // Keep docker's behavior for now. + specOpts = append(specOpts, customopts.WithoutAmbientCaps) + + if securityContext.GetPrivileged() { + if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() { + return nil, errors.New("no privileged container allowed in sandbox") + } + specOpts = append(specOpts, oci.WithPrivileged) + if !ociRuntime.PrivilegedWithoutHostDevices { + specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed) + } else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed { + // allow rwm on all devices for the container + specOpts = append(specOpts, oci.WithAllDevicesAllowed) + } + } + + // TODO: Figure out whether we should set no new privilege for sandbox container by default + if securityContext.GetNoNewPrivs() { + specOpts = append(specOpts, oci.WithNoNewPrivileges) + } + // TODO(random-liu): [P1] Set selinux options (privileged or not). + if securityContext.GetReadonlyRootfs() { + specOpts = append(specOpts, oci.WithRootFSReadonly()) + } + + if !c.config.DisableProcMount { + // Change the default masked/readonly paths to empty slices + // See https://github.com/containerd/containerd/issues/5029 + // TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec() + specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{})) + + // Apply masked paths if specified. + // If the container is privileged, this will be cleared later on. + if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil { + specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths)) + } + + // Apply readonly paths if specified. + // If the container is privileged, this will be cleared later on. + if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil { + specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths)) + } + } + + supplementalGroups := securityContext.GetSupplementalGroups() + specOpts = append(specOpts, customopts.WithSupplementalGroups(supplementalGroups)) + + // Default target PID namespace is the sandbox PID. + targetPid := sandboxPid + // If the container targets another container's PID namespace, + // set targetPid to the PID of that container. + nsOpts := securityContext.GetNamespaceOptions() + if nsOpts.GetPid() == runtime.NamespaceMode_TARGET { + targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId) + if err != nil { + return nil, fmt.Errorf("invalid target container: %w", err) + } + + status := targetContainer.Status.Get() + targetPid = status.Pid + } + + specOpts = append(specOpts, + // TODO: This is a hack to make this compile. We should move userns support to sbserver. + customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, nil, nil), + ) + + } else if isWindows { + specOpts = append(specOpts, + // Clear the root location since hcsshim expects it. + // NOTE: readonly rootfs doesn't work on windows. + customopts.WithoutRoot, + oci.WithWindowsNetworkNamespace(netNSPath), + oci.WithHostname(sandboxConfig.GetHostname()), + ) + + // All containers in a pod need to have HostProcess set if it was set on the pod, + // and vice versa no containers in the pod can be HostProcess if the pods spec + // didn't have the field set. The only case that is valid is if these are the same value. + cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess() + sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess() + if cntrHpc != sandboxHpc { + return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid") + } + + specOpts = append(specOpts, customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc))) } // Get spec opts that depend on features offered by the platform containerd daemon is running on. diff --git a/pkg/cri/sbserver/container_create_linux.go b/pkg/cri/sbserver/container_create_linux.go index b2af6deed..d8fcda6a8 100644 --- a/pkg/cri/sbserver/container_create_linux.go +++ b/pkg/cri/sbserver/container_create_linux.go @@ -36,7 +36,6 @@ import ( "github.com/opencontainers/selinux/go-selinux/label" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" - "github.com/containerd/containerd/pkg/cri/annotations" "github.com/containerd/containerd/pkg/cri/config" customopts "github.com/containerd/containerd/pkg/cri/opts" ) @@ -126,51 +125,7 @@ func (c *criService) platformSpec( extraMounts []*runtime.Mount, ociRuntime config.Runtime, ) (_ []oci.SpecOpts, retErr error) { - specOpts := []oci.SpecOpts{ - oci.WithoutRunMount, - } - // only clear the default security settings if the runtime does not have a custom - // base runtime spec spec. Admins can use this functionality to define - // default ulimits, seccomp, or other default settings. - if ociRuntime.BaseRuntimeSpec == "" { - specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings) - } - specOpts = append(specOpts, - customopts.WithRelativeRoot(relativeRootfsPath), - customopts.WithProcessArgs(config, imageConfig), - oci.WithDefaultPathEnv, - // this will be set based on the security context below - oci.WithNewPrivileges, - ) - if config.GetWorkingDir() != "" { - specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir())) - } else if imageConfig.WorkingDir != "" { - specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir)) - } - - if config.GetTty() { - specOpts = append(specOpts, oci.WithTTY) - } - - // Add HOSTNAME env. - var ( - err error - hostname = sandboxConfig.GetHostname() - ) - if hostname == "" { - if hostname, err = c.os.Hostname(); err != nil { - return nil, err - } - } - specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname})) - - // Apply envs from image config first, so that envs from container config - // can override them. - env := append([]string{}, imageConfig.Env...) - for _, e := range config.GetEnvs() { - env = append(env, e.GetKey()+"="+e.GetValue()) - } - specOpts = append(specOpts, oci.WithEnv(env)) + specOpts := []oci.SpecOpts{} securityContext := config.GetLinux().GetSecurityContext() labelOptions, err := toLabel(securityContext.GetSelinuxOptions()) @@ -197,61 +152,13 @@ func (c *criService) platformSpec( } }() - specOpts = append(specOpts, customopts.WithMounts(c.os, config, extraMounts, mountLabel)) - - if !c.config.DisableProcMount { - // Change the default masked/readonly paths to empty slices - // See https://github.com/containerd/containerd/issues/5029 - // TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec() - specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{})) - - // Apply masked paths if specified. - // If the container is privileged, this will be cleared later on. - if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil { - specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths)) - } - - // Apply readonly paths if specified. - // If the container is privileged, this will be cleared later on. - if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil { - specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths)) - } - } - - specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext), - customopts.WithCapabilities(securityContext, c.allCaps)) - - if securityContext.GetPrivileged() { - if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() { - return nil, errors.New("no privileged container allowed in sandbox") - } - specOpts = append(specOpts, oci.WithPrivileged) - if !ociRuntime.PrivilegedWithoutHostDevices { - specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed) - } else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed { - // allow rwm on all devices for the container - specOpts = append(specOpts, oci.WithAllDevicesAllowed) - } - } - - // Clear all ambient capabilities. The implication of non-root + caps - // is not clearly defined in Kubernetes. - // See https://github.com/kubernetes/kubernetes/issues/56374 - // Keep docker's behavior for now. specOpts = append(specOpts, - customopts.WithoutAmbientCaps, customopts.WithSelinuxLabels(processLabel, mountLabel), + customopts.WithMounts(c.os, config, extraMounts, mountLabel), + customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext), + customopts.WithCapabilities(securityContext, c.allCaps), ) - // TODO: Figure out whether we should set no new privilege for sandbox container by default - if securityContext.GetNoNewPrivs() { - specOpts = append(specOpts, oci.WithNoNewPrivileges) - } - // TODO(random-liu): [P1] Set selinux options (privileged or not). - if securityContext.GetReadonlyRootfs() { - specOpts = append(specOpts, oci.WithRootFSReadonly()) - } - if c.config.DisableCgroup { specOpts = append(specOpts, customopts.WithDisabledCgroups) } else { @@ -262,8 +169,6 @@ func (c *criService) platformSpec( } } - supplementalGroups := securityContext.GetSupplementalGroups() - // Get blockio class blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations) if err != nil { @@ -286,53 +191,14 @@ func (c *criService) platformSpec( specOpts = append(specOpts, oci.WithRdt(rdtClass, "", "")) } - for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations, - ociRuntime.PodAnnotations) { - specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) - } + specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj)) - for pKey, pValue := range getPassthroughAnnotations(config.Annotations, - ociRuntime.ContainerAnnotations) { - specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) - } - - // Default target PID namespace is the sandbox PID. - targetPid := sandboxPid - // If the container targets another container's PID namespace, - // set targetPid to the PID of that container. - nsOpts := securityContext.GetNamespaceOptions() - if nsOpts.GetPid() == runtime.NamespaceMode_TARGET { - targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId) - if err != nil { - return nil, fmt.Errorf("invalid target container: %w", err) - } - - status := targetContainer.Status.Get() - targetPid = status.Pid - } - - specOpts = append(specOpts, - customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj), - // TODO: This is a hack to make this compile. We should move userns support to sbserver. - customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, nil, nil), - customopts.WithSupplementalGroups(supplementalGroups), - customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), - customopts.WithAnnotation(annotations.SandboxID, sandboxID), - customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()), - customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()), - customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()), - customopts.WithAnnotation(annotations.ContainerName, containerName), - customopts.WithAnnotation(annotations.ImageName, imageName), - ) // cgroupns is used for hiding /sys/fs/cgroup from containers. // For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged. // https://github.com/containers/libpod/issues/4363 // https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace if cgroups.Mode() == cgroups.Unified && !securityContext.GetPrivileged() { - specOpts = append(specOpts, oci.WithLinuxNamespace( - runtimespec.LinuxNamespace{ - Type: runtimespec.CgroupNamespace, - })) + specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace})) } return specOpts, nil diff --git a/pkg/cri/sbserver/container_create_test.go b/pkg/cri/sbserver/container_create_test.go index 507928fd8..546a8d468 100644 --- a/pkg/cri/sbserver/container_create_test.go +++ b/pkg/cri/sbserver/container_create_test.go @@ -62,6 +62,7 @@ func TestGeneralContainerSpec(t *testing.T) { testID := "test-id" testPid := uint32(1234) containerConfig, sandboxConfig, imageConfig, specCheck := getCreateContainerTestData() + containerConfig.Command = []string{"/bin/true"} ociRuntime := config.Runtime{} c := newTestCRIService() testSandboxID := "sandbox-id" diff --git a/pkg/cri/sbserver/container_create_windows.go b/pkg/cri/sbserver/container_create_windows.go index 97b5f70a4..c4ebf420b 100644 --- a/pkg/cri/sbserver/container_create_windows.go +++ b/pkg/cri/sbserver/container_create_windows.go @@ -17,18 +17,15 @@ package sbserver import ( - "errors" "fmt" - "strconv" - "github.com/containerd/containerd/oci" - "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" - "github.com/containerd/containerd/pkg/cri/annotations" + "github.com/containerd/containerd/oci" "github.com/containerd/containerd/pkg/cri/config" customopts "github.com/containerd/containerd/pkg/cri/opts" + "github.com/containerd/containerd/snapshots" ) // No container mounts for windows. @@ -49,47 +46,13 @@ func (c *criService) platformSpec( extraMounts []*runtime.Mount, ociRuntime config.Runtime, ) ([]oci.SpecOpts, error) { - specOpts := []oci.SpecOpts{ - customopts.WithProcessArgs(config, imageConfig), - } - - // All containers in a pod need to have HostProcess set if it was set on the pod, - // and vice versa no containers in the pod can be HostProcess if the pods spec - // didn't have the field set. The only case that is valid is if these are the same value. - cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess() - sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess() - if cntrHpc != sandboxHpc { - return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid") - } - - if config.GetWorkingDir() != "" { - specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir())) - } else if imageConfig.WorkingDir != "" { - specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir)) - } - - if config.GetTty() { - specOpts = append(specOpts, oci.WithTTY) - } - - // Apply envs from image config first, so that envs from container config - // can override them. - env := append([]string{}, imageConfig.Env...) - for _, e := range config.GetEnvs() { - env = append(env, e.GetKey()+"="+e.GetValue()) - } - specOpts = append(specOpts, oci.WithEnv(env)) + specOpts := []oci.SpecOpts{} specOpts = append(specOpts, - // Clear the root location since hcsshim expects it. - // NOTE: readonly rootfs doesn't work on windows. - customopts.WithoutRoot, - oci.WithWindowsNetworkNamespace(netNSPath), - oci.WithHostname(sandboxConfig.GetHostname()), + customopts.WithWindowsMounts(c.os, config, extraMounts), + customopts.WithDevices(config), ) - specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithDevices(config)) - // Start with the image config user and override below if RunAsUsername is not "". username := imageConfig.User @@ -115,27 +78,6 @@ func (c *criService) platformSpec( // when trying to run the init process. specOpts = append(specOpts, oci.WithUser(username)) - for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations, - ociRuntime.PodAnnotations) { - specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) - } - - for pKey, pValue := range getPassthroughAnnotations(config.Annotations, - ociRuntime.ContainerAnnotations) { - specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) - } - - specOpts = append(specOpts, - customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), - customopts.WithAnnotation(annotations.SandboxID, sandboxID), - customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()), - customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()), - customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()), - customopts.WithAnnotation(annotations.ContainerName, containerName), - customopts.WithAnnotation(annotations.ImageName, imageName), - customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc)), - ) - return specOpts, nil } diff --git a/pkg/cri/sbserver/helpers_linux.go b/pkg/cri/sbserver/helpers_linux.go index d3abc3fb6..c465a4931 100644 --- a/pkg/cri/sbserver/helpers_linux.go +++ b/pkg/cri/sbserver/helpers_linux.go @@ -40,8 +40,6 @@ import ( ) const ( - // relativeRootfsPath is the rootfs path relative to bundle path. - relativeRootfsPath = "rootfs" // devShm is the default path of /dev/shm. devShm = "/dev/shm" // etcHosts is the default path of /etc/hosts file. @@ -50,8 +48,6 @@ const ( etcHostname = "/etc/hostname" // resolvConfPath is the abs path of resolv.conf on host or container. resolvConfPath = "/etc/resolv.conf" - // hostnameEnv is the key for HOSTNAME env. - hostnameEnv = "HOSTNAME" ) // getCgroupsPath generates container cgroups path.