Have separate spec builder for each platform

Signed-off-by: Maksym Pavlenko <pavlenko.maksym@gmail.com>
This commit is contained in:
Maksym Pavlenko 2023-01-11 13:12:25 -08:00
parent fdfa3519a3
commit 40be96efa9
21 changed files with 1086 additions and 746 deletions

View File

@ -45,3 +45,10 @@ func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Sp
func DeviceFromPath(path string) (*specs.LinuxDevice, error) {
return nil, errors.New("device from path not supported on Windows")
}
// WithDevices does nothing on Windows.
func WithDevices(devicePath, containerPath, permissions string) SpecOpts {
return func(ctx context.Context, client Client, container *containers.Container, spec *Spec) error {
return nil
}
}

View File

@ -25,13 +25,14 @@ import (
goruntime "runtime"
"strings"
"github.com/containerd/continuity/fs"
"github.com/containerd/containerd"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/mount"
"github.com/containerd/containerd/snapshots"
"github.com/containerd/continuity/fs"
)
// WithNewSnapshot wraps `containerd.WithNewSnapshot` so that if creating the

View File

@ -22,8 +22,6 @@ import (
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"syscall"
@ -31,255 +29,15 @@ import (
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/containerd/cgroups/v3"
"github.com/containerd/cgroups/v3/cgroup1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/mount"
"github.com/containerd/containerd/oci"
osinterface "github.com/containerd/containerd/pkg/os"
)
// WithMounts sorts and adds runtime and CRI mounts to the spec
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
// is mounted by both a CRI mount and an extra mount, the CRI mount will
// be kept.
var (
criMounts = config.GetMounts()
mounts = append([]*runtime.Mount{}, criMounts...)
)
// Copy all mounts from extra mounts, except for mounts overridden by CRI.
for _, e := range extra {
found := false
for _, c := range criMounts {
if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
found = true
break
}
}
if !found {
mounts = append(mounts, e)
}
}
// Sort mounts in number of parts. This ensures that high level mounts don't
// shadow other mounts.
sort.Sort(orderedMounts(mounts))
// Mount cgroup into the container as readonly, which inherits docker's behavior.
s.Mounts = append(s.Mounts, runtimespec.Mount{
Source: "cgroup",
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
})
// Copy all mounts from default mounts, except for
// - mounts overridden by supplied mount;
// - all mounts under /dev if a supplied /dev is present.
mountSet := make(map[string]struct{})
for _, m := range mounts {
mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
}
defaultMounts := s.Mounts
s.Mounts = nil
for _, m := range defaultMounts {
dst := filepath.Clean(m.Destination)
if _, ok := mountSet[dst]; ok {
// filter out mount overridden by a supplied mount
continue
}
if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
// filter out everything under /dev if /dev is a supplied mount
continue
}
s.Mounts = append(s.Mounts, m)
}
for _, mount := range mounts {
var (
dst = mount.GetContainerPath()
src = mount.GetHostPath()
)
// Create the host path if it doesn't exist.
// TODO(random-liu): Add CRI validation test for this case.
if _, err := osi.Stat(src); err != nil {
if !os.IsNotExist(err) {
return fmt.Errorf("failed to stat %q: %w", src, err)
}
if err := osi.MkdirAll(src, 0755); err != nil {
return fmt.Errorf("failed to mkdir %q: %w", src, err)
}
}
// TODO(random-liu): Add cri-containerd integration test or cri validation test
// for this.
src, err := osi.ResolveSymbolicLink(src)
if err != nil {
return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
}
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
options := []string{"rbind"}
switch mount.GetPropagation() {
case runtime.MountPropagation_PROPAGATION_PRIVATE:
options = append(options, "rprivate")
// Since default root propagation in runc is rprivate ignore
// setting the root propagation
case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
if err := ensureShared(src, osi.LookupMount); err != nil {
return err
}
options = append(options, "rshared")
s.Linux.RootfsPropagation = "rshared"
case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
return err
}
options = append(options, "rslave")
if s.Linux.RootfsPropagation != "rshared" &&
s.Linux.RootfsPropagation != "rslave" {
s.Linux.RootfsPropagation = "rslave"
}
default:
log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
options = append(options, "rprivate")
}
// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
// is readonly. This is different from docker's behavior, but make more sense.
if mount.GetReadonly() {
options = append(options, "ro")
} else {
options = append(options, "rw")
}
if mount.GetSelinuxRelabel() {
if err := label.Relabel(src, mountLabel, false); err != nil && err != unix.ENOTSUP {
return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
}
}
s.Mounts = append(s.Mounts, runtimespec.Mount{
Source: src,
Destination: dst,
Type: "bind",
Options: options,
})
}
return nil
}
}
// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
mountInfo, err := lookupMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
optsSplit := strings.Split(mountInfo.Optional, " ")
for _, opt := range optsSplit {
if strings.HasPrefix(opt, "shared:") {
return nil
}
}
return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint)
}
// ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
mountInfo, err := lookupMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
optsSplit := strings.Split(mountInfo.Optional, " ")
for _, opt := range optsSplit {
if strings.HasPrefix(opt, "shared:") {
return nil
} else if strings.HasPrefix(opt, "master:") {
return nil
}
}
return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint)
}
// getDeviceUserGroupID() is used to find the right uid/gid
// value for the device node created in the container namespace.
// The runtime executes mknod() and chmod()s the created
// device with the values returned here.
//
// On Linux, uid and gid are sufficient and the user/groupname do not
// need to be resolved.
//
// TODO(mythi): In case of user namespaces, the runtime simply bind
// mounts the devices from the host. Additional logic is needed
// to check that the runtimes effective UID/GID on the host has the
// permissions to access the device node and/or the right user namespace
// mappings are created.
//
// Ref: https://github.com/kubernetes/kubernetes/issues/92211
func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
if runAsVal != nil {
return uint32(runAsVal.GetValue())
}
return 0
}
// WithDevices sets the provided devices onto the container spec
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &runtimespec.LinuxResources{}
}
oldDevices := len(s.Linux.Devices)
for _, device := range config.GetDevices() {
path, err := osi.ResolveSymbolicLink(device.HostPath)
if err != nil {
return err
}
o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
if err := o(ctx, client, c, s); err != nil {
return err
}
}
if enableDeviceOwnershipFromSecurityContext {
UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser())
GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup())
// Loop all new devices added by oci.WithDevices() to update their
// dev.UID/dev.GID.
//
// non-zero UID/GID from SecurityContext is used to override host's
// device UID/GID for the container.
for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
if UID != 0 {
*s.Linux.Devices[idx].UID = UID
}
if GID != 0 {
*s.Linux.Devices[idx].GID = GID
}
}
}
return nil
}
}
// Linux dependent OCI spec opts.
var (
swapControllerAvailability bool
@ -312,88 +70,6 @@ func SwapControllerAvailable() bool {
return swapControllerAvailability
}
// WithResources sets the provided resource restrictions
func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if resources == nil {
return nil
}
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &runtimespec.LinuxResources{}
}
if s.Linux.Resources.CPU == nil {
s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
}
if s.Linux.Resources.Memory == nil {
s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
}
var (
p = uint64(resources.GetCpuPeriod())
q = resources.GetCpuQuota()
shares = uint64(resources.GetCpuShares())
limit = resources.GetMemoryLimitInBytes()
swapLimit = resources.GetMemorySwapLimitInBytes()
hugepages = resources.GetHugepageLimits()
)
if p != 0 {
s.Linux.Resources.CPU.Period = &p
}
if q != 0 {
s.Linux.Resources.CPU.Quota = &q
}
if shares != 0 {
s.Linux.Resources.CPU.Shares = &shares
}
if cpus := resources.GetCpusetCpus(); cpus != "" {
s.Linux.Resources.CPU.Cpus = cpus
}
if mems := resources.GetCpusetMems(); mems != "" {
s.Linux.Resources.CPU.Mems = resources.GetCpusetMems()
}
if limit != 0 {
s.Linux.Resources.Memory.Limit = &limit
// swap/memory limit should be equal to prevent container from swapping by default
if swapLimit == 0 && SwapControllerAvailable() {
s.Linux.Resources.Memory.Swap = &limit
}
}
if swapLimit != 0 {
s.Linux.Resources.Memory.Swap = &swapLimit
}
if !disableHugetlbController {
if isHugetlbControllerPresent() {
for _, limit := range hugepages {
s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
Pagesize: limit.PageSize,
Limit: limit.Limit,
})
}
} else {
if !tolerateMissingHugetlbController {
return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
"Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
}
logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
}
}
if unified := resources.GetUnified(); unified != nil {
if s.Linux.Resources.Unified == nil {
s.Linux.Resources.Unified = make(map[string]string)
}
for k, v := range unified {
s.Linux.Resources.Unified[k] = v
}
}
return nil
}
}
var (
supportsHugetlbOnce sync.Once
supportsHugetlb bool
@ -463,72 +139,6 @@ func IsCgroup2UnifiedMode() bool {
return isUnified
}
// WithOOMScoreAdj sets the oom score
func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
resources := config.GetLinux().GetResources()
if resources == nil {
return nil
}
adj := int(resources.GetOomScoreAdj())
if restrict {
var err error
adj, err = restrictOOMScoreAdj(adj)
if err != nil {
return err
}
}
s.Process.OOMScoreAdj = &adj
return nil
}
}
// WithPodOOMScoreAdj sets the oom score for the pod sandbox
func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
if restrict {
var err error
adj, err = restrictOOMScoreAdj(adj)
if err != nil {
return err
}
}
s.Process.OOMScoreAdj = &adj
return nil
}
}
func getCurrentOOMScoreAdj() (int, error) {
b, err := os.ReadFile("/proc/self/oom_score_adj")
if err != nil {
return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
}
s := strings.TrimSpace(string(b))
i, err := strconv.Atoi(s)
if err != nil {
return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
}
return i, nil
}
func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
currentOOMScoreAdj, err := getCurrentOOMScoreAdj()
if err != nil {
return preferredOOMScoreAdj, err
}
if preferredOOMScoreAdj < currentOOMScoreAdj {
return currentOOMScoreAdj, nil
}
return preferredOOMScoreAdj, nil
}
// WithCDI updates OCI spec with CDI content
func WithCDI(annotations map[string]string) oci.SpecOpts {
return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error {

View File

@ -0,0 +1,426 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"syscall"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/sirupsen/logrus"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/mount"
"github.com/containerd/containerd/oci"
osinterface "github.com/containerd/containerd/pkg/os"
)
// WithMounts sorts and adds runtime and CRI mounts to the spec
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
// is mounted by both a CRI mount and an extra mount, the CRI mount will
// be kept.
var (
criMounts = config.GetMounts()
mounts = append([]*runtime.Mount{}, criMounts...)
)
// Copy all mounts from extra mounts, except for mounts overridden by CRI.
for _, e := range extra {
found := false
for _, c := range criMounts {
if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
found = true
break
}
}
if !found {
mounts = append(mounts, e)
}
}
// Sort mounts in number of parts. This ensures that high level mounts don't
// shadow other mounts.
sort.Sort(orderedMounts(mounts))
// Mount cgroup into the container as readonly, which inherits docker's behavior.
s.Mounts = append(s.Mounts, runtimespec.Mount{
Source: "cgroup",
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
})
// Copy all mounts from default mounts, except for
// - mounts overridden by supplied mount;
// - all mounts under /dev if a supplied /dev is present.
mountSet := make(map[string]struct{})
for _, m := range mounts {
mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
}
defaultMounts := s.Mounts
s.Mounts = nil
for _, m := range defaultMounts {
dst := filepath.Clean(m.Destination)
if _, ok := mountSet[dst]; ok {
// filter out mount overridden by a supplied mount
continue
}
if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
// filter out everything under /dev if /dev is a supplied mount
continue
}
s.Mounts = append(s.Mounts, m)
}
for _, mount := range mounts {
var (
dst = mount.GetContainerPath()
src = mount.GetHostPath()
)
// Create the host path if it doesn't exist.
// TODO(random-liu): Add CRI validation test for this case.
if _, err := osi.Stat(src); err != nil {
if !os.IsNotExist(err) {
return fmt.Errorf("failed to stat %q: %w", src, err)
}
if err := osi.MkdirAll(src, 0755); err != nil {
return fmt.Errorf("failed to mkdir %q: %w", src, err)
}
}
// TODO(random-liu): Add cri-containerd integration test or cri validation test
// for this.
src, err := osi.ResolveSymbolicLink(src)
if err != nil {
return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
}
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
options := []string{"rbind"}
switch mount.GetPropagation() {
case runtime.MountPropagation_PROPAGATION_PRIVATE:
options = append(options, "rprivate")
// Since default root propagation in runc is rprivate ignore
// setting the root propagation
case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
if err := ensureShared(src, osi.LookupMount); err != nil {
return err
}
options = append(options, "rshared")
s.Linux.RootfsPropagation = "rshared"
case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
return err
}
options = append(options, "rslave")
if s.Linux.RootfsPropagation != "rshared" &&
s.Linux.RootfsPropagation != "rslave" {
s.Linux.RootfsPropagation = "rslave"
}
default:
log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
options = append(options, "rprivate")
}
// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
// is readonly. This is different from docker's behavior, but make more sense.
if mount.GetReadonly() {
options = append(options, "ro")
} else {
options = append(options, "rw")
}
if mount.GetSelinuxRelabel() {
ENOTSUP := syscall.Errno(0x5f) // Linux specific error code, this branch will not execute on non Linux platforms.
if err := label.Relabel(src, mountLabel, false); err != nil && err != ENOTSUP {
return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
}
}
s.Mounts = append(s.Mounts, runtimespec.Mount{
Source: src,
Destination: dst,
Type: "bind",
Options: options,
})
}
return nil
}
}
// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
mountInfo, err := lookupMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
optsSplit := strings.Split(mountInfo.Optional, " ")
for _, opt := range optsSplit {
if strings.HasPrefix(opt, "shared:") {
return nil
}
}
return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint)
}
// ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
mountInfo, err := lookupMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
optsSplit := strings.Split(mountInfo.Optional, " ")
for _, opt := range optsSplit {
if strings.HasPrefix(opt, "shared:") {
return nil
} else if strings.HasPrefix(opt, "master:") {
return nil
}
}
return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint)
}
// getDeviceUserGroupID() is used to find the right uid/gid
// value for the device node created in the container namespace.
// The runtime executes mknod() and chmod()s the created
// device with the values returned here.
//
// On Linux, uid and gid are sufficient and the user/groupname do not
// need to be resolved.
//
// TODO(mythi): In case of user namespaces, the runtime simply bind
// mounts the devices from the host. Additional logic is needed
// to check that the runtimes effective UID/GID on the host has the
// permissions to access the device node and/or the right user namespace
// mappings are created.
//
// Ref: https://github.com/kubernetes/kubernetes/issues/92211
func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
if runAsVal != nil {
return uint32(runAsVal.GetValue())
}
return 0
}
// WithDevices sets the provided devices onto the container spec
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &runtimespec.LinuxResources{}
}
oldDevices := len(s.Linux.Devices)
for _, device := range config.GetDevices() {
path, err := osi.ResolveSymbolicLink(device.HostPath)
if err != nil {
return err
}
o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
if err := o(ctx, client, c, s); err != nil {
return err
}
}
if enableDeviceOwnershipFromSecurityContext {
UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser())
GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup())
// Loop all new devices added by oci.WithDevices() to update their
// dev.UID/dev.GID.
//
// non-zero UID/GID from SecurityContext is used to override host's
// device UID/GID for the container.
for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
if UID != 0 {
*s.Linux.Devices[idx].UID = UID
}
if GID != 0 {
*s.Linux.Devices[idx].GID = GID
}
}
}
return nil
}
}
// WithResources sets the provided resource restrictions
func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if resources == nil {
return nil
}
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &runtimespec.LinuxResources{}
}
if s.Linux.Resources.CPU == nil {
s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
}
if s.Linux.Resources.Memory == nil {
s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
}
var (
p = uint64(resources.GetCpuPeriod())
q = resources.GetCpuQuota()
shares = uint64(resources.GetCpuShares())
limit = resources.GetMemoryLimitInBytes()
swapLimit = resources.GetMemorySwapLimitInBytes()
hugepages = resources.GetHugepageLimits()
)
if p != 0 {
s.Linux.Resources.CPU.Period = &p
}
if q != 0 {
s.Linux.Resources.CPU.Quota = &q
}
if shares != 0 {
s.Linux.Resources.CPU.Shares = &shares
}
if cpus := resources.GetCpusetCpus(); cpus != "" {
s.Linux.Resources.CPU.Cpus = cpus
}
if mems := resources.GetCpusetMems(); mems != "" {
s.Linux.Resources.CPU.Mems = resources.GetCpusetMems()
}
if limit != 0 {
s.Linux.Resources.Memory.Limit = &limit
// swap/memory limit should be equal to prevent container from swapping by default
if swapLimit == 0 && SwapControllerAvailable() {
s.Linux.Resources.Memory.Swap = &limit
}
}
if swapLimit != 0 {
s.Linux.Resources.Memory.Swap = &swapLimit
}
if !disableHugetlbController {
if isHugetlbControllerPresent() {
for _, limit := range hugepages {
s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
Pagesize: limit.PageSize,
Limit: limit.Limit,
})
}
} else {
if !tolerateMissingHugetlbController {
return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
"Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
}
logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
}
}
if unified := resources.GetUnified(); unified != nil {
if s.Linux.Resources.Unified == nil {
s.Linux.Resources.Unified = make(map[string]string)
}
for k, v := range unified {
s.Linux.Resources.Unified[k] = v
}
}
return nil
}
}
// WithOOMScoreAdj sets the oom score
func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
resources := config.GetLinux().GetResources()
if resources == nil {
return nil
}
adj := int(resources.GetOomScoreAdj())
if restrict {
var err error
adj, err = restrictOOMScoreAdj(adj)
if err != nil {
return err
}
}
s.Process.OOMScoreAdj = &adj
return nil
}
}
// WithPodOOMScoreAdj sets the oom score for the pod sandbox
func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
if restrict {
var err error
adj, err = restrictOOMScoreAdj(adj)
if err != nil {
return err
}
}
s.Process.OOMScoreAdj = &adj
return nil
}
}
func getCurrentOOMScoreAdj() (int, error) {
b, err := os.ReadFile("/proc/self/oom_score_adj")
if err != nil {
return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
}
s := strings.TrimSpace(string(b))
i, err := strconv.Atoi(s)
if err != nil {
return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
}
return i, nil
}
func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
currentOOMScoreAdj, err := getCurrentOOMScoreAdj()
if err != nil {
return preferredOOMScoreAdj, err
}
if preferredOOMScoreAdj < currentOOMScoreAdj {
return currentOOMScoreAdj, nil
}
return preferredOOMScoreAdj, nil
}

View File

@ -0,0 +1,41 @@
//go:build !linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/oci"
)
func isHugetlbControllerPresent() bool {
return false
}
func SwapControllerAvailable() bool {
return false
}
// WithCDI does nothing on non Linux platforms.
func WithCDI(_ map[string]string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, container *containers.Container, spec *oci.Spec) error {
return nil
}
}

View File

@ -24,11 +24,11 @@ import (
"sort"
"strings"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/oci"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/oci"
osinterface "github.com/containerd/containerd/pkg/os"
)
@ -229,8 +229,8 @@ func WithWindowsCredentialSpec(credentialSpec string) oci.SpecOpts {
}
}
// WithDevices sets the provided devices onto the container spec
func WithDevices(config *runtime.ContainerConfig) oci.SpecOpts {
// WithWindowsDevices sets the provided devices onto the container spec
func WithWindowsDevices(config *runtime.ContainerConfig) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
for _, device := range config.GetDevices() {
if device.ContainerPath != "" {

View File

@ -22,14 +22,15 @@ import (
"strings"
"testing"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/namespaces"
"github.com/containerd/containerd/oci"
osinterface "github.com/containerd/containerd/pkg/os"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/namespaces"
"github.com/containerd/containerd/oci"
osinterface "github.com/containerd/containerd/pkg/os"
)
func TestWithDevices(t *testing.T) {
@ -183,7 +184,7 @@ func TestWithDevices(t *testing.T) {
config := runtime.ContainerConfig{}
config.Devices = tc.devices
specOpts := []oci.SpecOpts{WithDevices(&config)}
specOpts := []oci.SpecOpts{WithWindowsDevices(&config)}
platform := "windows"
if tc.isLCOW {

View File

@ -24,6 +24,14 @@ import (
"strconv"
"time"
"github.com/containerd/typeurl"
"github.com/davecgh/go-spew/spew"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/opencontainers/selinux/go-selinux/label"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd"
"github.com/containerd/containerd/api/types"
"github.com/containerd/containerd/containers"
@ -37,12 +45,6 @@ import (
containerstore "github.com/containerd/containerd/pkg/cri/store/container"
"github.com/containerd/containerd/pkg/cri/util"
ctrdutil "github.com/containerd/containerd/pkg/cri/util"
"github.com/containerd/typeurl"
"github.com/davecgh/go-spew/spew"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func init() {
@ -419,44 +421,317 @@ func (c *criService) buildContainerSpec(
ociRuntime config.Runtime,
) (_ *runtimespec.Spec, retErr error) {
var (
specOpts []oci.SpecOpts
// Platform helpers
isLinux = platform.OS == "linux"
isWindows = platform.OS == "windows"
isDarwin = platform.OS == "darwin"
)
if isLinux {
specOpts = append(specOpts, oci.WithoutRunMount)
switch {
case isLinux:
return c.buildLinuxSpec(
id,
sandboxID,
sandboxPid,
netNSPath,
containerName,
imageName,
config,
sandboxConfig,
imageConfig,
extraMounts,
ociRuntime,
)
case isWindows:
return c.buildWindowsSpec(
id,
sandboxID,
sandboxPid,
netNSPath,
containerName,
imageName,
config,
sandboxConfig,
imageConfig,
extraMounts,
ociRuntime,
)
case isDarwin:
return c.buildDarwinSpec(
id,
sandboxID,
containerName,
imageName,
config,
sandboxConfig,
imageConfig,
extraMounts,
ociRuntime,
)
default:
return nil, fmt.Errorf("unsupported spec platform: %s", platform.OS)
}
}
// Only clear the default security settings if the runtime does not have a custom
// base runtime spec. Admins can use this functionality to define
// default ulimits, seccomp, or other default settings.
if ociRuntime.BaseRuntimeSpec == "" {
specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings)
func (c *criService) buildLinuxSpec(
id string,
sandboxID string,
sandboxPid uint32,
netNSPath string,
containerName string,
imageName string,
config *runtime.ContainerConfig,
sandboxConfig *runtime.PodSandboxConfig,
imageConfig *imagespec.ImageConfig,
extraMounts []*runtime.Mount,
ociRuntime config.Runtime,
) (_ *runtimespec.Spec, retErr error) {
specOpts := []oci.SpecOpts{
oci.WithoutRunMount,
}
// only clear the default security settings if the runtime does not have a custom
// base runtime spec spec. Admins can use this functionality to define
// default ulimits, seccomp, or other default settings.
if ociRuntime.BaseRuntimeSpec == "" {
specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings)
}
specOpts = append(specOpts,
customopts.WithRelativeRoot(relativeRootfsPath),
customopts.WithProcessArgs(config, imageConfig),
oci.WithDefaultPathEnv,
// this will be set based on the security context below
oci.WithNewPrivileges,
)
if config.GetWorkingDir() != "" {
specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
} else if imageConfig.WorkingDir != "" {
specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
}
if config.GetTty() {
specOpts = append(specOpts, oci.WithTTY)
}
// Add HOSTNAME env.
var (
err error
hostname = sandboxConfig.GetHostname()
)
if hostname == "" {
if hostname, err = c.os.Hostname(); err != nil {
return nil, err
}
}
specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname}))
specOpts = append(specOpts,
customopts.WithRelativeRoot(relativeRootfsPath),
oci.WithDefaultPathEnv,
// this will be set based on the security context below
oci.WithNewPrivileges,
)
// Apply envs from image config first, so that envs from container config
// can override them.
env := append([]string{}, imageConfig.Env...)
for _, e := range config.GetEnvs() {
env = append(env, e.GetKey()+"="+e.GetValue())
}
specOpts = append(specOpts, oci.WithEnv(env))
// Add HOSTNAME env.
var (
err error
hostname = sandboxConfig.GetHostname()
)
if hostname == "" {
if hostname, err = c.os.Hostname(); err != nil {
securityContext := config.GetLinux().GetSecurityContext()
labelOptions, err := toLabel(securityContext.GetSelinuxOptions())
if err != nil {
return nil, err
}
if len(labelOptions) == 0 {
// Use pod level SELinux config
if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil {
labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel)
if err != nil {
return nil, err
}
}
specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname}))
}
specOpts = append(specOpts, customopts.WithProcessArgs(config, imageConfig))
processLabel, mountLabel, err := label.InitLabels(labelOptions)
if err != nil {
return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
}
defer func() {
if retErr != nil {
selinux.ReleaseLabel(processLabel)
}
}()
specOpts = append(specOpts, customopts.WithMounts(c.os, config, extraMounts, mountLabel))
if !c.config.DisableProcMount {
// Change the default masked/readonly paths to empty slices
// See https://github.com/containerd/containerd/issues/5029
// TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec()
specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{}))
// Apply masked paths if specified.
// If the container is privileged, this will be cleared later on.
if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil {
specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths))
}
// Apply readonly paths if specified.
// If the container is privileged, this will be cleared later on.
if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil {
specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths))
}
}
specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
customopts.WithCapabilities(securityContext, c.allCaps))
if securityContext.GetPrivileged() {
if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() {
return nil, errors.New("no privileged container allowed in sandbox")
}
specOpts = append(specOpts, oci.WithPrivileged)
if !ociRuntime.PrivilegedWithoutHostDevices {
specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed)
} else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed {
// allow rwm on all devices for the container
specOpts = append(specOpts, oci.WithAllDevicesAllowed)
}
}
// Clear all ambient capabilities. The implication of non-root + caps
// is not clearly defined in Kubernetes.
// See https://github.com/kubernetes/kubernetes/issues/56374
// Keep docker's behavior for now.
specOpts = append(specOpts,
customopts.WithoutAmbientCaps,
customopts.WithSelinuxLabels(processLabel, mountLabel),
)
// TODO: Figure out whether we should set no new privilege for sandbox container by default
if securityContext.GetNoNewPrivs() {
specOpts = append(specOpts, oci.WithNoNewPrivileges)
}
// TODO(random-liu): [P1] Set selinux options (privileged or not).
if securityContext.GetReadonlyRootfs() {
specOpts = append(specOpts, oci.WithRootFSReadonly())
}
if c.config.DisableCgroup {
specOpts = append(specOpts, customopts.WithDisabledCgroups)
} else {
specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController))
if sandboxConfig.GetLinux().GetCgroupParent() != "" {
cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id)
specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
}
}
supplementalGroups := securityContext.GetSupplementalGroups()
// Get blockio class
blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to set blockio class: %w", err)
}
if blockIOClass != "" {
if linuxBlockIO, err := blockIOToLinuxOci(blockIOClass); err == nil {
specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO))
} else {
return nil, err
}
}
// Get RDT class
rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to set RDT class: %w", err)
}
if rdtClass != "" {
specOpts = append(specOpts, oci.WithRdt(rdtClass, "", ""))
}
for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
ociRuntime.PodAnnotations) {
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
}
for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
ociRuntime.ContainerAnnotations) {
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
}
// Default target PID namespace is the sandbox PID.
targetPid := sandboxPid
// If the container targets another container's PID namespace,
// set targetPid to the PID of that container.
nsOpts := securityContext.GetNamespaceOptions()
if nsOpts.GetPid() == runtime.NamespaceMode_TARGET {
targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId)
if err != nil {
return nil, fmt.Errorf("invalid target container: %w", err)
}
status := targetContainer.Status.Get()
targetPid = status.Pid
}
uids, gids, err := parseUsernsIDs(nsOpts.GetUsernsOptions())
if err != nil {
return nil, fmt.Errorf("user namespace configuration: %w", err)
}
// Check sandbox userns config is consistent with container config.
sandboxUsernsOpts := sandboxConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
if !sameUsernsConfig(sandboxUsernsOpts, nsOpts.GetUsernsOptions()) {
return nil, fmt.Errorf("user namespace config for sandbox is different from container. Sandbox userns config: %v - Container userns config: %v", sandboxUsernsOpts, nsOpts.GetUsernsOptions())
}
specOpts = append(specOpts,
customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj),
customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, uids, gids),
customopts.WithSupplementalGroups(supplementalGroups),
customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer),
customopts.WithAnnotation(annotations.SandboxID, sandboxID),
customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()),
customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()),
customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()),
customopts.WithAnnotation(annotations.ContainerName, containerName),
customopts.WithAnnotation(annotations.ImageName, imageName),
)
// cgroupns is used for hiding /sys/fs/cgroup from containers.
// For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged.
// https://github.com/containers/libpod/issues/4363
// https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace
if isUnifiedCgroupsMode() && !securityContext.GetPrivileged() {
specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace}))
}
return c.runtimeSpec(id, ociRuntime.BaseRuntimeSpec, specOpts...)
}
func (c *criService) buildWindowsSpec(
id string,
sandboxID string,
sandboxPid uint32,
netNSPath string,
containerName string,
imageName string,
config *runtime.ContainerConfig,
sandboxConfig *runtime.PodSandboxConfig,
imageConfig *imagespec.ImageConfig,
extraMounts []*runtime.Mount,
ociRuntime config.Runtime,
) (_ *runtimespec.Spec, retErr error) {
specOpts := []oci.SpecOpts{
customopts.WithProcessArgs(config, imageConfig),
}
// All containers in a pod need to have HostProcess set if it was set on the pod,
// and vice versa no containers in the pod can be HostProcess if the pods spec
// didn't have the field set. The only case that is valid is if these are the same value.
cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess()
sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess()
if cntrHpc != sandboxHpc {
return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid")
}
if config.GetWorkingDir() != "" {
specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
@ -476,116 +751,98 @@ func (c *criService) buildContainerSpec(
}
specOpts = append(specOpts, oci.WithEnv(env))
if isWindows {
specOpts = append(specOpts,
// Clear the root location since hcsshim expects it.
// NOTE: readonly rootfs doesn't work on windows.
customopts.WithoutRoot,
oci.WithWindowsNetworkNamespace(netNSPath),
oci.WithHostname(sandboxConfig.GetHostname()),
)
// All containers in a pod need to have HostProcess set if it was set on the pod,
// and vice versa no containers in the pod can be HostProcess if the pods spec
// didn't have the field set. The only case that is valid is if these are the same value.
cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess()
sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess()
if cntrHpc != sandboxHpc {
return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid")
}
specOpts = append(specOpts, customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc)))
}
// Get spec opts that depend on features offered by the platform containerd daemon is running on.
platformSpecOpts, err := c.platformSpec(
id,
sandboxID,
config,
sandboxConfig,
imageConfig,
extraMounts,
specOpts = append(specOpts,
// Clear the root location since hcsshim expects it.
// NOTE: readonly rootfs doesn't work on windows.
customopts.WithoutRoot,
oci.WithWindowsNetworkNamespace(netNSPath),
oci.WithHostname(sandboxConfig.GetHostname()),
)
if err != nil {
return nil, err
specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithWindowsDevices(config))
// Start with the image config user and override below if RunAsUsername is not "".
username := imageConfig.User
windowsConfig := config.GetWindows()
if windowsConfig != nil {
specOpts = append(specOpts, customopts.WithWindowsResources(windowsConfig.GetResources()))
securityCtx := windowsConfig.GetSecurityContext()
if securityCtx != nil {
runAsUser := securityCtx.GetRunAsUsername()
if runAsUser != "" {
username = runAsUser
}
cs := securityCtx.GetCredentialSpec()
if cs != "" {
specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs))
}
}
}
specOpts = append(specOpts, platformSpecOpts...)
// There really isn't a good Windows way to verify that the username is available in the
// image as early as here like there is for Linux. Later on in the stack hcsshim
// will handle the behavior of erroring out if the user isn't available in the image
// when trying to run the init process.
specOpts = append(specOpts, oci.WithUser(username))
if isLinux {
securityContext := config.GetLinux().GetSecurityContext()
if !c.config.DisableProcMount {
// Change the default masked/readonly paths to empty slices
// See https://github.com/containerd/containerd/issues/5029
// TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec()
specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{}))
// Apply masked paths if specified.
// If the container is privileged, this will be cleared later on.
if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil {
specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths))
}
// Apply readonly paths if specified.
// If the container is privileged, this will be cleared later on.
if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil {
specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths))
}
}
if securityContext.GetPrivileged() {
if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() {
return nil, errors.New("no privileged container allowed in sandbox")
}
specOpts = append(specOpts, oci.WithPrivileged)
if !ociRuntime.PrivilegedWithoutHostDevices {
specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed)
} else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed {
// allow rwm on all devices for the container
specOpts = append(specOpts, oci.WithAllDevicesAllowed)
}
}
// Clear all ambient capabilities. The implication of non-root + caps
// is not clearly defined in Kubernetes.
// See https://github.com/kubernetes/kubernetes/issues/56374
// Keep docker's behavior for now.
specOpts = append(specOpts, customopts.WithoutAmbientCaps)
// TODO: Figure out whether we should set no new privilege for sandbox container by default
if securityContext.GetNoNewPrivs() {
specOpts = append(specOpts, oci.WithNoNewPrivileges)
}
// TODO(random-liu): [P1] Set selinux options (privileged or not).
if securityContext.GetReadonlyRootfs() {
specOpts = append(specOpts, oci.WithRootFSReadonly())
}
supplementalGroups := securityContext.GetSupplementalGroups()
specOpts = append(specOpts, customopts.WithSupplementalGroups(supplementalGroups))
// Default target PID namespace is the sandbox PID.
targetPid := sandboxPid
// If the container targets another container's PID namespace,
// set targetPid to the PID of that container.
nsOpts := securityContext.GetNamespaceOptions()
if nsOpts.GetPid() == runtime.NamespaceMode_TARGET {
targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId)
if err != nil {
return nil, fmt.Errorf("invalid target container: %w", err)
}
status := targetContainer.Status.Get()
targetPid = status.Pid
}
specOpts = append(specOpts,
// TODO: This is a hack to make this compile. We should move userns support to sbserver.
customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, nil, nil),
)
for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
ociRuntime.PodAnnotations) {
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
}
for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
ociRuntime.ContainerAnnotations) {
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
}
specOpts = append(specOpts,
customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer),
customopts.WithAnnotation(annotations.SandboxID, sandboxID),
customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()),
customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()),
customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()),
customopts.WithAnnotation(annotations.ContainerName, containerName),
customopts.WithAnnotation(annotations.ImageName, imageName),
customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc)),
)
return c.runtimeSpec(id, ociRuntime.BaseRuntimeSpec, specOpts...)
}
func (c *criService) buildDarwinSpec(
id string,
sandboxID string,
containerName string,
imageName string,
config *runtime.ContainerConfig,
sandboxConfig *runtime.PodSandboxConfig,
imageConfig *imagespec.ImageConfig,
extraMounts []*runtime.Mount,
ociRuntime config.Runtime,
) (_ *runtimespec.Spec, retErr error) {
specOpts := []oci.SpecOpts{
customopts.WithProcessArgs(config, imageConfig),
}
if config.GetWorkingDir() != "" {
specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
} else if imageConfig.WorkingDir != "" {
specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
}
if config.GetTty() {
specOpts = append(specOpts, oci.WithTTY)
}
// Apply envs from image config first, so that envs from container config
// can override them.
env := append([]string{}, imageConfig.Env...)
for _, e := range config.GetEnvs() {
env = append(env, e.GetKey()+"="+e.GetValue())
}
specOpts = append(specOpts, oci.WithEnv(env))
for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
ociRuntime.PodAnnotations) {
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))

View File

@ -25,16 +25,13 @@ import (
"strconv"
"strings"
"github.com/containerd/cgroups/v3"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/contrib/apparmor"
"github.com/containerd/containerd/contrib/seccomp"
"github.com/containerd/containerd/oci"
"github.com/containerd/containerd/snapshots"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/opencontainers/selinux/go-selinux/label"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
customopts "github.com/containerd/containerd/pkg/cri/opts"
)
@ -111,93 +108,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
return mounts
}
func (c *criService) platformSpec(
id string,
sandboxID string,
config *runtime.ContainerConfig,
sandboxConfig *runtime.PodSandboxConfig,
imageConfig *imagespec.ImageConfig,
extraMounts []*runtime.Mount,
) (_ []oci.SpecOpts, retErr error) {
specOpts := []oci.SpecOpts{}
securityContext := config.GetLinux().GetSecurityContext()
labelOptions, err := toLabel(securityContext.GetSelinuxOptions())
if err != nil {
return nil, err
}
if len(labelOptions) == 0 {
// Use pod level SELinux config
if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil {
labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel)
if err != nil {
return nil, err
}
}
}
processLabel, mountLabel, err := label.InitLabels(labelOptions)
if err != nil {
return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
}
defer func() {
if retErr != nil {
selinux.ReleaseLabel(processLabel)
}
}()
specOpts = append(specOpts,
customopts.WithSelinuxLabels(processLabel, mountLabel),
customopts.WithMounts(c.os, config, extraMounts, mountLabel),
customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
customopts.WithCapabilities(securityContext, c.allCaps),
)
if c.config.DisableCgroup {
specOpts = append(specOpts, customopts.WithDisabledCgroups)
} else {
specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController))
if sandboxConfig.GetLinux().GetCgroupParent() != "" {
cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id)
specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
}
}
// Get blockio class
blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to set blockio class: %w", err)
}
if blockIOClass != "" {
if linuxBlockIO, err := blockIOToLinuxOci(blockIOClass); err == nil {
specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO))
} else {
return nil, err
}
}
// Get RDT class
rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to set RDT class: %w", err)
}
if rdtClass != "" {
specOpts = append(specOpts, oci.WithRdt(rdtClass, "", ""))
}
specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj))
// cgroupns is used for hiding /sys/fs/cgroup from containers.
// For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged.
// https://github.com/containers/libpod/issues/4363
// https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace
if cgroups.Mode() == cgroups.Unified && !securityContext.GetPrivileged() {
specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace}))
}
return specOpts, nil
}
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
var specOpts []oci.SpecOpts
securityContext := config.GetLinux().GetSecurityContext()

View File

@ -19,10 +19,11 @@
package sbserver
import (
"github.com/containerd/containerd/oci"
"github.com/containerd/containerd/snapshots"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/oci"
"github.com/containerd/containerd/snapshots"
)
// containerMounts sets up necessary container system file mounts
@ -31,17 +32,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
return []*runtime.Mount{}
}
func (c *criService) platformSpec(
id string,
sandboxID string,
config *runtime.ContainerConfig,
sandboxConfig *runtime.PodSandboxConfig,
imageConfig *imagespec.ImageConfig,
extraMounts []*runtime.Mount,
) ([]oci.SpecOpts, error) {
return []oci.SpecOpts{}, nil
}
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
return []oci.SpecOpts{}, nil
}

View File

@ -23,7 +23,6 @@ import (
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/oci"
customopts "github.com/containerd/containerd/pkg/cri/opts"
"github.com/containerd/containerd/snapshots"
)
@ -32,49 +31,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
return nil
}
func (c *criService) platformSpec(
id string,
sandboxID string,
config *runtime.ContainerConfig,
sandboxConfig *runtime.PodSandboxConfig,
imageConfig *imagespec.ImageConfig,
extraMounts []*runtime.Mount,
) ([]oci.SpecOpts, error) {
specOpts := []oci.SpecOpts{}
specOpts = append(specOpts,
customopts.WithWindowsMounts(c.os, config, extraMounts),
customopts.WithDevices(config),
)
// Start with the image config user and override below if RunAsUsername is not "".
username := imageConfig.User
windowsConfig := config.GetWindows()
if windowsConfig != nil {
specOpts = append(specOpts, customopts.WithWindowsResources(windowsConfig.GetResources()))
securityCtx := windowsConfig.GetSecurityContext()
if securityCtx != nil {
runAsUser := securityCtx.GetRunAsUsername()
if runAsUser != "" {
username = runAsUser
}
cs := securityCtx.GetCredentialSpec()
if cs != "" {
specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs))
}
}
}
// There really isn't a good Windows way to verify that the username is available in the
// image as early as here like there is for Linux. Later on in the stack hcsshim
// will handle the behavior of erroring out if the user isn't available in the image
// when trying to run the init process.
specOpts = append(specOpts, oci.WithUser(username))
return specOpts, nil
}
// No extra spec options needed for windows.
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
return nil, nil

View File

@ -21,6 +21,7 @@ import (
"fmt"
"path"
"path/filepath"
"regexp"
goruntime "runtime"
"strconv"
"strings"
@ -603,3 +604,180 @@ func hostNetwork(config *runtime.PodSandboxConfig) bool {
}
return hostNet
}
// getCgroupsPath generates container cgroups path.
func getCgroupsPath(cgroupsParent, id string) string {
base := path.Base(cgroupsParent)
if strings.HasSuffix(base, ".slice") {
// For a.slice/b.slice/c.slice, base is c.slice.
// runc systemd cgroup path format is "slice:prefix:name".
return strings.Join([]string{base, "cri-containerd", id}, ":")
}
return filepath.Join(cgroupsParent, id)
}
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
var labels []string
if selinuxOptions == nil {
return nil, nil
}
if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
return nil, err
}
if selinuxOptions.User != "" {
labels = append(labels, "user:"+selinuxOptions.User)
}
if selinuxOptions.Role != "" {
labels = append(labels, "role:"+selinuxOptions.Role)
}
if selinuxOptions.Type != "" {
labels = append(labels, "type:"+selinuxOptions.Type)
}
if selinuxOptions.Level != "" {
labels = append(labels, "level:"+selinuxOptions.Level)
}
return labels, nil
}
func checkSelinuxLevel(level string) error {
if len(level) == 0 {
return nil
}
matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level)
if err != nil {
return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err)
}
if !matched {
return fmt.Errorf("the format of 'level' %q is not correct", level)
}
return nil
}
func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) {
var m []runtimespec.LinuxIDMapping
if len(runtimeIDMap) == 0 {
return m, nil
}
if len(runtimeIDMap) > 1 {
// We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap))
}
// We know len is 1 now.
if runtimeIDMap[0] == nil {
return m, nil
}
uidMap := *runtimeIDMap[0]
if uidMap.Length < 1 {
return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length)
}
m = []runtimespec.LinuxIDMapping{
{
ContainerID: uidMap.ContainerId,
HostID: uidMap.HostId,
Size: uidMap.Length,
},
}
return m, nil
}
func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) {
if userns == nil {
// If userns is not set, the kubelet doesn't support this option
// and we should just fallback to no userns. This is completely
// valid.
return nil, nil, nil
}
uids, err := parseUsernsIDMap(userns.GetUids())
if err != nil {
return nil, nil, fmt.Errorf("UID mapping: %w", err)
}
gids, err = parseUsernsIDMap(userns.GetGids())
if err != nil {
return nil, nil, fmt.Errorf("GID mapping: %w", err)
}
switch mode := userns.GetMode(); mode {
case runtime.NamespaceMode_NODE:
if len(uids) != 0 || len(gids) != 0 {
return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids))
}
case runtime.NamespaceMode_POD:
// This is valid, we will handle it in WithPodNamespaces().
if len(uids) == 0 || len(gids) == 0 {
return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
}
default:
return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
}
return uids, gids, nil
}
// sameUsernsConfig checks if the userns configs are the same. If the mappings
// on each config are the same but in different order, it returns false.
// XXX: If the runtime.UserNamespace struct changes, we should update this
// function accordingly.
func sameUsernsConfig(a, b *runtime.UserNamespace) bool {
// If both are nil, they are the same.
if a == nil && b == nil {
return true
}
// If only one is nil, they are different.
if a == nil || b == nil {
return false
}
// At this point, a is not nil nor b.
if a.GetMode() != b.GetMode() {
return false
}
aUids, aGids, err := parseUsernsIDs(a)
if err != nil {
return false
}
bUids, bGids, err := parseUsernsIDs(b)
if err != nil {
return false
}
if !sameMapping(aUids, bUids) {
return false
}
if !sameMapping(aGids, bGids) {
return false
}
return true
}
// sameMapping checks if the mappings are the same. If the mappings are the same
// but in different order, it returns false.
func sameMapping(a, b []runtimespec.LinuxIDMapping) bool {
if len(a) != len(b) {
return false
}
for x := range a {
if a[x].ContainerID != b[x].ContainerID {
return false
}
if a[x].HostID != b[x].HostID {
return false
}
if a[x].Size != b[x].Size {
return false
}
}
return true
}

View File

@ -20,23 +20,22 @@ import (
"context"
"fmt"
"os"
"path"
"path/filepath"
"regexp"
"sort"
"strings"
"syscall"
"time"
"github.com/containerd/cgroups/v3"
"github.com/moby/sys/mountinfo"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/mount"
"github.com/containerd/containerd/pkg/apparmor"
"github.com/containerd/containerd/pkg/seccomp"
"github.com/containerd/containerd/pkg/seutil"
"github.com/moby/sys/mountinfo"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
const (
@ -50,17 +49,6 @@ const (
resolvConfPath = "/etc/resolv.conf"
)
// getCgroupsPath generates container cgroups path.
func getCgroupsPath(cgroupsParent, id string) string {
base := path.Base(cgroupsParent)
if strings.HasSuffix(base, ".slice") {
// For a.slice/b.slice/c.slice, base is c.slice.
// runc systemd cgroup path format is "slice:prefix:name".
return strings.Join([]string{base, "cri-containerd", id}, ":")
}
return filepath.Join(cgroupsParent, id)
}
// getSandboxRootDir returns the root directory for managing sandbox files,
// e.g. hosts files.
func (c *criService) getSandboxRootDir(id string) string {
@ -93,46 +81,6 @@ func (c *criService) getSandboxDevShm(id string) string {
return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
}
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
var labels []string
if selinuxOptions == nil {
return nil, nil
}
if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
return nil, err
}
if selinuxOptions.User != "" {
labels = append(labels, "user:"+selinuxOptions.User)
}
if selinuxOptions.Role != "" {
labels = append(labels, "role:"+selinuxOptions.Role)
}
if selinuxOptions.Type != "" {
labels = append(labels, "type:"+selinuxOptions.Type)
}
if selinuxOptions.Level != "" {
labels = append(labels, "level:"+selinuxOptions.Level)
}
return labels, nil
}
func checkSelinuxLevel(level string) error {
if len(level) == 0 {
return nil
}
matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level)
if err != nil {
return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err)
}
if !matched {
return fmt.Errorf("the format of 'level' %q is not correct", level)
}
return nil
}
// apparmorEnabled returns true if apparmor is enabled, supported by the host,
// if apparmor_parser is installed, and if we are not running docker-in-docker.
func (c *criService) apparmorEnabled() bool {
@ -270,3 +218,9 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
spec.Process.SelinuxLabel = l
return nil
}
// getCgroupsMode returns cgropu mode.
// TODO: add build constraints to cgroups package and remove this helper
func isUnifiedCgroupsMode() bool {
return cgroups.Mode() == cgroups.Unified
}

View File

@ -41,3 +41,7 @@ func ensureRemoveAll(ctx context.Context, dir string) error {
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
return nil
}
func isUnifiedCgroupsMode() bool {
return false
}

View File

@ -166,3 +166,7 @@ func ensureRemoveAll(_ context.Context, dir string) error {
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
return nil
}
func isUnifiedCgroupsMode() bool {
return false
}

View File

@ -21,12 +21,13 @@ import (
"fmt"
"strconv"
"github.com/containerd/containerd/oci"
"github.com/containerd/containerd/snapshots"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/oci"
"github.com/containerd/containerd/snapshots"
"github.com/containerd/containerd/pkg/cri/annotations"
"github.com/containerd/containerd/pkg/cri/config"
customopts "github.com/containerd/containerd/pkg/cri/opts"
@ -89,7 +90,7 @@ func (c *criService) containerSpec(
oci.WithHostname(sandboxConfig.GetHostname()),
)
specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithDevices(config))
specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithWindowsDevices(config))
// Start with the image config user and override below if RunAsUsername is not "".
username := imageConfig.User