567 lines
18 KiB
Go
567 lines
18 KiB
Go
/*
|
|
Copyright The containerd Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package opts
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"syscall"
|
|
|
|
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
|
"github.com/containerd/cgroups/v3"
|
|
"github.com/containerd/cgroups/v3/cgroup1"
|
|
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/opencontainers/selinux/go-selinux/label"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
|
|
"github.com/containerd/containerd/containers"
|
|
"github.com/containerd/containerd/log"
|
|
"github.com/containerd/containerd/mount"
|
|
"github.com/containerd/containerd/oci"
|
|
osinterface "github.com/containerd/containerd/pkg/os"
|
|
)
|
|
|
|
// WithMounts sorts and adds runtime and CRI mounts to the spec
|
|
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
|
|
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
|
|
// is mounted by both a CRI mount and an extra mount, the CRI mount will
|
|
// be kept.
|
|
var (
|
|
criMounts = config.GetMounts()
|
|
mounts = append([]*runtime.Mount{}, criMounts...)
|
|
)
|
|
// Copy all mounts from extra mounts, except for mounts overridden by CRI.
|
|
for _, e := range extra {
|
|
found := false
|
|
for _, c := range criMounts {
|
|
if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
mounts = append(mounts, e)
|
|
}
|
|
}
|
|
|
|
// Sort mounts in number of parts. This ensures that high level mounts don't
|
|
// shadow other mounts.
|
|
sort.Sort(orderedMounts(mounts))
|
|
|
|
// Mount cgroup into the container as readonly, which inherits docker's behavior.
|
|
s.Mounts = append(s.Mounts, runtimespec.Mount{
|
|
Source: "cgroup",
|
|
Destination: "/sys/fs/cgroup",
|
|
Type: "cgroup",
|
|
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
|
|
})
|
|
|
|
// Copy all mounts from default mounts, except for
|
|
// - mounts overridden by supplied mount;
|
|
// - all mounts under /dev if a supplied /dev is present.
|
|
mountSet := make(map[string]struct{})
|
|
for _, m := range mounts {
|
|
mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
|
|
}
|
|
|
|
defaultMounts := s.Mounts
|
|
s.Mounts = nil
|
|
|
|
for _, m := range defaultMounts {
|
|
dst := filepath.Clean(m.Destination)
|
|
if _, ok := mountSet[dst]; ok {
|
|
// filter out mount overridden by a supplied mount
|
|
continue
|
|
}
|
|
if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
|
|
// filter out everything under /dev if /dev is a supplied mount
|
|
continue
|
|
}
|
|
s.Mounts = append(s.Mounts, m)
|
|
}
|
|
|
|
for _, mount := range mounts {
|
|
var (
|
|
dst = mount.GetContainerPath()
|
|
src = mount.GetHostPath()
|
|
)
|
|
// Create the host path if it doesn't exist.
|
|
// TODO(random-liu): Add CRI validation test for this case.
|
|
if _, err := osi.Stat(src); err != nil {
|
|
if !os.IsNotExist(err) {
|
|
return fmt.Errorf("failed to stat %q: %w", src, err)
|
|
}
|
|
if err := osi.MkdirAll(src, 0755); err != nil {
|
|
return fmt.Errorf("failed to mkdir %q: %w", src, err)
|
|
}
|
|
}
|
|
// TODO(random-liu): Add cri-containerd integration test or cri validation test
|
|
// for this.
|
|
src, err := osi.ResolveSymbolicLink(src)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
|
|
}
|
|
if s.Linux == nil {
|
|
s.Linux = &runtimespec.Linux{}
|
|
}
|
|
options := []string{"rbind"}
|
|
switch mount.GetPropagation() {
|
|
case runtime.MountPropagation_PROPAGATION_PRIVATE:
|
|
options = append(options, "rprivate")
|
|
// Since default root propagation in runc is rprivate ignore
|
|
// setting the root propagation
|
|
case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
|
|
if err := ensureShared(src, osi.LookupMount); err != nil {
|
|
return err
|
|
}
|
|
options = append(options, "rshared")
|
|
s.Linux.RootfsPropagation = "rshared"
|
|
case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
|
|
if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
|
|
return err
|
|
}
|
|
options = append(options, "rslave")
|
|
if s.Linux.RootfsPropagation != "rshared" &&
|
|
s.Linux.RootfsPropagation != "rslave" {
|
|
s.Linux.RootfsPropagation = "rslave"
|
|
}
|
|
default:
|
|
log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
|
|
options = append(options, "rprivate")
|
|
}
|
|
|
|
// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
|
|
// is readonly. This is different from docker's behavior, but make more sense.
|
|
if mount.GetReadonly() {
|
|
options = append(options, "ro")
|
|
} else {
|
|
options = append(options, "rw")
|
|
}
|
|
|
|
if mount.GetSelinuxRelabel() {
|
|
if err := label.Relabel(src, mountLabel, false); err != nil && err != unix.ENOTSUP {
|
|
return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
|
|
}
|
|
}
|
|
s.Mounts = append(s.Mounts, runtimespec.Mount{
|
|
Source: src,
|
|
Destination: dst,
|
|
Type: "bind",
|
|
Options: options,
|
|
})
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// Ensure mount point on which path is mounted, is shared.
|
|
func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
|
|
mountInfo, err := lookupMount(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Make sure source mount point is shared.
|
|
optsSplit := strings.Split(mountInfo.Optional, " ")
|
|
for _, opt := range optsSplit {
|
|
if strings.HasPrefix(opt, "shared:") {
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint)
|
|
}
|
|
|
|
// ensure mount point on which path is mounted, is either shared or slave.
|
|
func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
|
|
mountInfo, err := lookupMount(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Make sure source mount point is shared.
|
|
optsSplit := strings.Split(mountInfo.Optional, " ")
|
|
for _, opt := range optsSplit {
|
|
if strings.HasPrefix(opt, "shared:") {
|
|
return nil
|
|
} else if strings.HasPrefix(opt, "master:") {
|
|
return nil
|
|
}
|
|
}
|
|
return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint)
|
|
}
|
|
|
|
// getDeviceUserGroupID() is used to find the right uid/gid
|
|
// value for the device node created in the container namespace.
|
|
// The runtime executes mknod() and chmod()s the created
|
|
// device with the values returned here.
|
|
//
|
|
// On Linux, uid and gid are sufficient and the user/groupname do not
|
|
// need to be resolved.
|
|
//
|
|
// TODO(mythi): In case of user namespaces, the runtime simply bind
|
|
// mounts the devices from the host. Additional logic is needed
|
|
// to check that the runtimes effective UID/GID on the host has the
|
|
// permissions to access the device node and/or the right user namespace
|
|
// mappings are created.
|
|
//
|
|
// Ref: https://github.com/kubernetes/kubernetes/issues/92211
|
|
func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
|
|
if runAsVal != nil {
|
|
return uint32(runAsVal.GetValue())
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// WithDevices sets the provided devices onto the container spec
|
|
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
|
if s.Linux == nil {
|
|
s.Linux = &runtimespec.Linux{}
|
|
}
|
|
if s.Linux.Resources == nil {
|
|
s.Linux.Resources = &runtimespec.LinuxResources{}
|
|
}
|
|
|
|
oldDevices := len(s.Linux.Devices)
|
|
|
|
for _, device := range config.GetDevices() {
|
|
path, err := osi.ResolveSymbolicLink(device.HostPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
|
|
if err := o(ctx, client, c, s); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if enableDeviceOwnershipFromSecurityContext {
|
|
UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser())
|
|
GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup())
|
|
// Loop all new devices added by oci.WithDevices() to update their
|
|
// dev.UID/dev.GID.
|
|
//
|
|
// non-zero UID/GID from SecurityContext is used to override host's
|
|
// device UID/GID for the container.
|
|
for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
|
|
if UID != 0 {
|
|
*s.Linux.Devices[idx].UID = UID
|
|
}
|
|
if GID != 0 {
|
|
*s.Linux.Devices[idx].GID = GID
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
|
|
var (
|
|
swapControllerAvailability bool
|
|
swapControllerAvailabilityOnce sync.Once
|
|
)
|
|
|
|
// SwapControllerAvailable returns true if the swap controller is available
|
|
func SwapControllerAvailable() bool {
|
|
swapControllerAvailabilityOnce.Do(func() {
|
|
const warn = "Failed to detect the availability of the swap controller, assuming not available"
|
|
p := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes"
|
|
if cgroups.Mode() == cgroups.Unified {
|
|
// memory.swap.max does not exist in the cgroup root, so we check /sys/fs/cgroup/<SELF>/memory.swap.max
|
|
_, unified, err := cgroup1.ParseCgroupFileUnified("/proc/self/cgroup")
|
|
if err != nil {
|
|
err = fmt.Errorf("failed to parse /proc/self/cgroup: %w", err)
|
|
logrus.WithError(err).Warn(warn)
|
|
return
|
|
}
|
|
p = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max")
|
|
}
|
|
if _, err := os.Stat(p); err != nil {
|
|
if !errors.Is(err, os.ErrNotExist) {
|
|
logrus.WithError(err).Warn(warn)
|
|
}
|
|
return
|
|
}
|
|
swapControllerAvailability = true
|
|
})
|
|
return swapControllerAvailability
|
|
}
|
|
|
|
// WithResources sets the provided resource restrictions
|
|
func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
|
if resources == nil {
|
|
return nil
|
|
}
|
|
if s.Linux == nil {
|
|
s.Linux = &runtimespec.Linux{}
|
|
}
|
|
if s.Linux.Resources == nil {
|
|
s.Linux.Resources = &runtimespec.LinuxResources{}
|
|
}
|
|
if s.Linux.Resources.CPU == nil {
|
|
s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
|
|
}
|
|
if s.Linux.Resources.Memory == nil {
|
|
s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
|
|
}
|
|
var (
|
|
p = uint64(resources.GetCpuPeriod())
|
|
q = resources.GetCpuQuota()
|
|
shares = uint64(resources.GetCpuShares())
|
|
limit = resources.GetMemoryLimitInBytes()
|
|
swapLimit = resources.GetMemorySwapLimitInBytes()
|
|
hugepages = resources.GetHugepageLimits()
|
|
)
|
|
|
|
if p != 0 {
|
|
s.Linux.Resources.CPU.Period = &p
|
|
}
|
|
if q != 0 {
|
|
s.Linux.Resources.CPU.Quota = &q
|
|
}
|
|
if shares != 0 {
|
|
s.Linux.Resources.CPU.Shares = &shares
|
|
}
|
|
if cpus := resources.GetCpusetCpus(); cpus != "" {
|
|
s.Linux.Resources.CPU.Cpus = cpus
|
|
}
|
|
if mems := resources.GetCpusetMems(); mems != "" {
|
|
s.Linux.Resources.CPU.Mems = resources.GetCpusetMems()
|
|
}
|
|
if limit != 0 {
|
|
s.Linux.Resources.Memory.Limit = &limit
|
|
// swap/memory limit should be equal to prevent container from swapping by default
|
|
if swapLimit == 0 && SwapControllerAvailable() {
|
|
s.Linux.Resources.Memory.Swap = &limit
|
|
}
|
|
}
|
|
if swapLimit != 0 {
|
|
s.Linux.Resources.Memory.Swap = &swapLimit
|
|
}
|
|
|
|
if !disableHugetlbController {
|
|
if isHugetlbControllerPresent() {
|
|
for _, limit := range hugepages {
|
|
s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
|
|
Pagesize: limit.PageSize,
|
|
Limit: limit.Limit,
|
|
})
|
|
}
|
|
} else {
|
|
if !tolerateMissingHugetlbController {
|
|
return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
|
|
"Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
|
|
}
|
|
logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
|
|
}
|
|
}
|
|
|
|
if unified := resources.GetUnified(); unified != nil {
|
|
if s.Linux.Resources.Unified == nil {
|
|
s.Linux.Resources.Unified = make(map[string]string)
|
|
}
|
|
for k, v := range unified {
|
|
s.Linux.Resources.Unified[k] = v
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
|
|
var (
|
|
supportsHugetlbOnce sync.Once
|
|
supportsHugetlb bool
|
|
)
|
|
|
|
func isHugetlbControllerPresent() bool {
|
|
supportsHugetlbOnce.Do(func() {
|
|
supportsHugetlb = false
|
|
if IsCgroup2UnifiedMode() {
|
|
supportsHugetlb, _ = cgroupv2HasHugetlb()
|
|
} else {
|
|
supportsHugetlb, _ = cgroupv1HasHugetlb()
|
|
}
|
|
})
|
|
return supportsHugetlb
|
|
}
|
|
|
|
var (
|
|
_cgroupv1HasHugetlbOnce sync.Once
|
|
_cgroupv1HasHugetlb bool
|
|
_cgroupv1HasHugetlbErr error
|
|
_cgroupv2HasHugetlbOnce sync.Once
|
|
_cgroupv2HasHugetlb bool
|
|
_cgroupv2HasHugetlbErr error
|
|
isUnifiedOnce sync.Once
|
|
isUnified bool
|
|
)
|
|
|
|
// cgroupv1HasHugetlb returns whether the hugetlb controller is present on
|
|
// cgroup v1.
|
|
func cgroupv1HasHugetlb() (bool, error) {
|
|
_cgroupv1HasHugetlbOnce.Do(func() {
|
|
if _, err := os.ReadDir("/sys/fs/cgroup/hugetlb"); err != nil {
|
|
_cgroupv1HasHugetlbErr = fmt.Errorf("readdir /sys/fs/cgroup/hugetlb: %w", err)
|
|
_cgroupv1HasHugetlb = false
|
|
} else {
|
|
_cgroupv1HasHugetlbErr = nil
|
|
_cgroupv1HasHugetlb = true
|
|
}
|
|
})
|
|
return _cgroupv1HasHugetlb, _cgroupv1HasHugetlbErr
|
|
}
|
|
|
|
// cgroupv2HasHugetlb returns whether the hugetlb controller is present on
|
|
// cgroup v2.
|
|
func cgroupv2HasHugetlb() (bool, error) {
|
|
_cgroupv2HasHugetlbOnce.Do(func() {
|
|
controllers, err := os.ReadFile("/sys/fs/cgroup/cgroup.controllers")
|
|
if err != nil {
|
|
_cgroupv2HasHugetlbErr = fmt.Errorf("read /sys/fs/cgroup/cgroup.controllers: %w", err)
|
|
return
|
|
}
|
|
_cgroupv2HasHugetlb = strings.Contains(string(controllers), "hugetlb")
|
|
})
|
|
return _cgroupv2HasHugetlb, _cgroupv2HasHugetlbErr
|
|
}
|
|
|
|
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
|
|
func IsCgroup2UnifiedMode() bool {
|
|
isUnifiedOnce.Do(func() {
|
|
var st syscall.Statfs_t
|
|
if err := syscall.Statfs("/sys/fs/cgroup", &st); err != nil {
|
|
panic("cannot statfs cgroup root")
|
|
}
|
|
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
|
|
})
|
|
return isUnified
|
|
}
|
|
|
|
// WithOOMScoreAdj sets the oom score
|
|
func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Process == nil {
|
|
s.Process = &runtimespec.Process{}
|
|
}
|
|
|
|
resources := config.GetLinux().GetResources()
|
|
if resources == nil {
|
|
return nil
|
|
}
|
|
adj := int(resources.GetOomScoreAdj())
|
|
if restrict {
|
|
var err error
|
|
adj, err = restrictOOMScoreAdj(adj)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
s.Process.OOMScoreAdj = &adj
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithPodOOMScoreAdj sets the oom score for the pod sandbox
|
|
func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Process == nil {
|
|
s.Process = &runtimespec.Process{}
|
|
}
|
|
if restrict {
|
|
var err error
|
|
adj, err = restrictOOMScoreAdj(adj)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
s.Process.OOMScoreAdj = &adj
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func getCurrentOOMScoreAdj() (int, error) {
|
|
b, err := os.ReadFile("/proc/self/oom_score_adj")
|
|
if err != nil {
|
|
return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
|
|
}
|
|
s := strings.TrimSpace(string(b))
|
|
i, err := strconv.Atoi(s)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
|
|
}
|
|
return i, nil
|
|
}
|
|
|
|
func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
|
|
currentOOMScoreAdj, err := getCurrentOOMScoreAdj()
|
|
if err != nil {
|
|
return preferredOOMScoreAdj, err
|
|
}
|
|
if preferredOOMScoreAdj < currentOOMScoreAdj {
|
|
return currentOOMScoreAdj, nil
|
|
}
|
|
return preferredOOMScoreAdj, nil
|
|
}
|
|
|
|
// WithCDI updates OCI spec with CDI content
|
|
func WithCDI(annotations map[string]string) oci.SpecOpts {
|
|
return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error {
|
|
// TODO: Once CRI is extended with native CDI support this will need to be updated...
|
|
_, cdiDevices, err := cdi.ParseAnnotations(annotations)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to parse CDI device annotations: %w", err)
|
|
}
|
|
if cdiDevices == nil {
|
|
return nil
|
|
}
|
|
|
|
log.G(ctx).Infof("container %v: CDI devices: %v", c.ID, cdiDevices)
|
|
|
|
registry := cdi.GetRegistry()
|
|
if err = registry.Refresh(); err != nil {
|
|
// We don't consider registry refresh failure a fatal error.
|
|
// For instance, a dynamically generated invalid CDI Spec file for
|
|
// any particular vendor shouldn't prevent injection of devices of
|
|
// different vendors. CDI itself knows better and it will fail the
|
|
// injection if necessary.
|
|
log.G(ctx).Warnf("CDI registry refresh failed: %v", err)
|
|
}
|
|
|
|
if _, err := registry.InjectDevices(s, cdiDevices...); err != nil {
|
|
return fmt.Errorf("CDI device injection failed: %w", err)
|
|
}
|
|
|
|
// One crucial thing to keep in mind is that CDI device injection
|
|
// might add OCI Spec environment variables, hooks, and mounts as
|
|
// well. Therefore it is important that none of the corresponding
|
|
// OCI Spec fields are reset up in the call stack once we return.
|
|
return nil
|
|
}
|
|
}
|