Move CRI from pkg/ to internal/

Signed-off-by: Maksym Pavlenko <pavlenko.maksym@gmail.com>
This commit is contained in:
Maksym Pavlenko
2024-02-02 09:45:44 -08:00
parent db1e16da34
commit bbac058cf3
215 changed files with 254 additions and 254 deletions

View File

@@ -0,0 +1,155 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"errors"
"fmt"
"os"
"strings"
"github.com/containerd/continuity/fs"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
containerd "github.com/containerd/containerd/v2/client"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/core/mount"
"github.com/containerd/containerd/v2/core/snapshots"
"github.com/containerd/errdefs"
"github.com/containerd/log"
)
// WithNewSnapshot wraps `containerd.WithNewSnapshot` so that if creating the
// snapshot fails we make sure the image is actually unpacked and retry.
func WithNewSnapshot(id string, i containerd.Image, opts ...snapshots.Opt) containerd.NewContainerOpts {
f := containerd.WithNewSnapshot(id, i, opts...)
return func(ctx context.Context, client *containerd.Client, c *containers.Container) error {
if err := f(ctx, client, c); err != nil {
if !errdefs.IsNotFound(err) {
return err
}
if err := i.Unpack(ctx, c.Snapshotter); err != nil {
return fmt.Errorf("error unpacking image: %w", err)
}
return f(ctx, client, c)
}
return nil
}
}
// WithVolumes copies ownership of volume in rootfs to its corresponding host path.
// It doesn't update runtime spec.
// The passed in map is a host path to container path map for all volumes.
func WithVolumes(volumeMounts map[string]string, platform imagespec.Platform) containerd.NewContainerOpts {
return func(ctx context.Context, client *containerd.Client, c *containers.Container) (err error) {
if c.Snapshotter == "" {
return errors.New("no snapshotter set for container")
}
if c.SnapshotKey == "" {
return errors.New("rootfs not created for container")
}
snapshotter := client.SnapshotService(c.Snapshotter)
mounts, err := snapshotter.Mounts(ctx, c.SnapshotKey)
if err != nil {
return err
}
// Since only read is needed, append ReadOnly mount option to prevent linux kernel
// from syncing whole filesystem in umount syscall.
if len(mounts) == 1 && mounts[0].Type == "overlay" {
mounts[0].Options = append(mounts[0].Options, "ro")
}
root, err := os.MkdirTemp("", "ctd-volume")
if err != nil {
return err
}
// We change RemoveAll to Remove so that we either leak a temp dir
// if it fails but not RM snapshot data.
// refer to https://github.com/containerd/containerd/pull/1868
// https://github.com/containerd/containerd/pull/1785
defer os.Remove(root)
if err := mount.All(mounts, root); err != nil {
return fmt.Errorf("failed to mount: %w", err)
}
defer func() {
if uerr := mount.Unmount(root, 0); uerr != nil {
log.G(ctx).WithError(uerr).Errorf("Failed to unmount snapshot %q", root)
if err == nil {
err = uerr
}
}
}()
for host, volume := range volumeMounts {
if platform.OS == "windows" {
// Windows allows volume mounts in subfolders under C: and as any other drive letter like D:, E:, etc.
// An image may contain files inside a folder defined as a VOLUME in a Dockerfile. On Windows, images
// can only contain pre-existing files for volumes situated on the root filesystem, which is C:.
// For any other volumes, we need to skip attempting to copy existing contents.
//
// C:\some\volume --> \some\volume
// D:\some\volume --> skip
if len(volume) >= 2 && string(volume[1]) == ":" {
// Perform a case insensitive comparison to "C", and skip non-C mounted volumes.
if !strings.EqualFold(string(volume[0]), "c") {
continue
}
// This is a volume mounted somewhere under C:\. We strip the drive letter and allow fs.RootPath()
// to append the remaining path to the rootfs path as seen by the host OS.
volume = volume[2:]
}
}
src, err := fs.RootPath(root, volume)
if err != nil {
return fmt.Errorf("rootpath on mountPath %s, volume %s: %w", root, volume, err)
}
if _, err := os.Stat(src); err != nil {
if os.IsNotExist(err) {
// Skip copying directory if it does not exist.
continue
}
return fmt.Errorf("stat volume in rootfs: %w", err)
}
if err := copyExistingContents(src, host); err != nil {
return fmt.Errorf("taking runtime copy of volume: %w", err)
}
}
return nil
}
}
// copyExistingContents copies from the source to the destination and
// ensures the ownership is appropriately set.
func copyExistingContents(source, destination string) error {
f, err := os.Open(destination)
if err != nil {
return err
}
defer f.Close()
dstList, err := f.Readdirnames(-1)
if err != nil {
return err
}
if len(dstList) != 0 {
return fmt.Errorf("volume at %q is not initially empty", destination)
}
return fs.CopyDir(destination, source, fs.WithXAttrExclude("security.selinux"))
}

View File

@@ -0,0 +1,119 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"fmt"
"os"
"path/filepath"
"sort"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/oci"
osinterface "github.com/containerd/containerd/v2/pkg/os"
)
// WithDarwinMounts adds mounts from CRI's container config + extra mounts.
func WithDarwinMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, container *containers.Container, s *oci.Spec) error {
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
// is mounted by both a CRI mount and an extra mount, the CRI mount will
// be kept.
var (
criMounts = config.GetMounts()
mounts = append([]*runtime.Mount{}, criMounts...)
)
// Copy all mounts from extra mounts, except for mounts overridden by CRI.
for _, e := range extra {
found := false
for _, c := range criMounts {
if cleanMount(e.ContainerPath) == cleanMount(c.ContainerPath) {
found = true
break
}
}
if !found {
mounts = append(mounts, e)
}
}
// Sort mounts in number of parts. This ensures that high level mounts don't
// shadow other mounts.
sort.Sort(orderedMounts(mounts))
// Copy all mounts from default mounts, except for
// - mounts overridden by supplied mount;
mountSet := make(map[string]struct{})
for _, m := range mounts {
mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
}
defaultMounts := s.Mounts
s.Mounts = nil
for _, m := range defaultMounts {
dst := cleanMount(m.Destination)
if _, ok := mountSet[dst]; ok {
// filter out mount overridden by a supplied mount
continue
}
s.Mounts = append(s.Mounts, m)
}
for _, mount := range mounts {
var (
dst = mount.GetContainerPath()
src = mount.GetHostPath()
)
// Create the host path if it doesn't exist.
if _, err := osi.Stat(src); err != nil {
if !os.IsNotExist(err) {
return fmt.Errorf("failed to stat %q: %w", src, err)
}
if err := osi.MkdirAll(src, 0755); err != nil {
return fmt.Errorf("failed to mkdir %q: %w", src, err)
}
}
src, err := osi.ResolveSymbolicLink(src)
if err != nil {
return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
}
var options []string
if mount.GetReadonly() {
options = append(options, "ro")
} else {
options = append(options, "rw")
}
s.Mounts = append(s.Mounts, runtimespec.Mount{
Source: src,
Destination: dst,
Type: "bind",
Options: options,
})
}
return nil
}
}

View File

@@ -0,0 +1,177 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"syscall"
"github.com/containerd/cgroups/v3"
"golang.org/x/sys/unix"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"tags.cncf.io/container-device-interface/pkg/cdi"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/oci"
"github.com/containerd/log"
)
// Linux dependent OCI spec opts.
var (
swapControllerAvailability bool
swapControllerAvailabilityOnce sync.Once
)
// SwapControllerAvailable returns true if the swap controller is available
func SwapControllerAvailable() bool {
swapControllerAvailabilityOnce.Do(func() {
const warn = "Failed to detect the availability of the swap controller, assuming not available"
p := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes"
if cgroups.Mode() == cgroups.Unified {
// memory.swap.max does not exist in the cgroup root, so we check /sys/fs/cgroup/<SELF>/memory.swap.max
_, unified, err := cgroups.ParseCgroupFileUnified("/proc/self/cgroup")
if err != nil {
err = fmt.Errorf("failed to parse /proc/self/cgroup: %w", err)
log.L.WithError(err).Warn(warn)
return
}
p = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max")
}
if _, err := os.Stat(p); err != nil {
if !errors.Is(err, os.ErrNotExist) {
log.L.WithError(err).Warn(warn)
}
return
}
swapControllerAvailability = true
})
return swapControllerAvailability
}
var (
supportsHugetlbOnce sync.Once
supportsHugetlb bool
)
func isHugetlbControllerPresent() bool {
supportsHugetlbOnce.Do(func() {
supportsHugetlb = false
if IsCgroup2UnifiedMode() {
supportsHugetlb = cgroupv2HasHugetlb()
} else {
supportsHugetlb = cgroupv1HasHugetlb()
}
})
return supportsHugetlb
}
var (
_cgroupv1HasHugetlbOnce sync.Once
_cgroupv1HasHugetlb bool
_cgroupv2HasHugetlbOnce sync.Once
_cgroupv2HasHugetlb bool
isUnifiedOnce sync.Once
isUnified bool
)
// cgroupv1HasHugetlb returns whether the hugetlb controller is present on
// cgroup v1.
func cgroupv1HasHugetlb() bool {
_cgroupv1HasHugetlbOnce.Do(func() {
if _, err := os.ReadDir("/sys/fs/cgroup/hugetlb"); err != nil {
_cgroupv1HasHugetlb = false
} else {
_cgroupv1HasHugetlb = true
}
})
return _cgroupv1HasHugetlb
}
// cgroupv2HasHugetlb returns whether the hugetlb controller is present on
// cgroup v2.
func cgroupv2HasHugetlb() bool {
_cgroupv2HasHugetlbOnce.Do(func() {
controllers, err := os.ReadFile("/sys/fs/cgroup/cgroup.controllers")
if err != nil {
return
}
_cgroupv2HasHugetlb = strings.Contains(string(controllers), "hugetlb")
})
return _cgroupv2HasHugetlb
}
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
func IsCgroup2UnifiedMode() bool {
isUnifiedOnce.Do(func() {
var st syscall.Statfs_t
if err := syscall.Statfs("/sys/fs/cgroup", &st); err != nil {
panic("cannot statfs cgroup root")
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
return isUnified
}
// WithCDI updates OCI spec with CDI content
func WithCDI(annotations map[string]string, CDIDevices []*runtime.CDIDevice) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *oci.Spec) error {
seen := make(map[string]bool)
// Add devices from CDIDevices CRI field
var devices []string
var err error
for _, device := range CDIDevices {
deviceName := device.Name
if seen[deviceName] {
log.G(ctx).Debugf("Skipping duplicated CDI device %s", deviceName)
continue
}
devices = append(devices, deviceName)
seen[deviceName] = true
}
log.G(ctx).Infof("Container %v: CDI devices from CRI Config.CDIDevices: %v", c.ID, devices)
// Add devices from CDI annotations
_, devsFromAnnotations, err := cdi.ParseAnnotations(annotations)
if err != nil {
return fmt.Errorf("failed to parse CDI device annotations: %w", err)
}
if devsFromAnnotations != nil {
log.G(ctx).Infof("Container %v: CDI devices from annotations: %v", c.ID, devsFromAnnotations)
for _, deviceName := range devsFromAnnotations {
if seen[deviceName] {
// TODO: change to Warning when passing CDI devices as annotations is deprecated
log.G(ctx).Debugf("Skipping duplicated CDI device %s", deviceName)
continue
}
devices = append(devices, deviceName)
seen[deviceName] = true
}
// TODO: change to Warning when passing CDI devices as annotations is deprecated
log.G(ctx).Debug("Passing CDI devices as annotations will be deprecated soon, please use CRI CDIDevices instead")
}
return oci.WithCDIDevices(devices...)(ctx, client, c, s)
}
}

View File

@@ -0,0 +1,471 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"syscall"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/core/mount"
"github.com/containerd/containerd/v2/pkg/oci"
osinterface "github.com/containerd/containerd/v2/pkg/os"
"github.com/containerd/log"
)
// RuntimeConfig is a subset of [github.com/containerd/containerd/v2/internal/cri/config].
// Needed for avoiding circular imports.
type RuntimeConfig struct {
TreatRoMountsAsRro bool // only applies to volumes
}
// WithMounts sorts and adds runtime and CRI mounts to the spec
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string, rtConfig *RuntimeConfig) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
// is mounted by both a CRI mount and an extra mount, the CRI mount will
// be kept.
var (
criMounts = config.GetMounts()
mounts = append([]*runtime.Mount{}, criMounts...)
)
// Copy all mounts from extra mounts, except for mounts overridden by CRI.
for _, e := range extra {
found := false
for _, c := range criMounts {
if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
found = true
break
}
}
if !found {
mounts = append(mounts, e)
}
}
// Sort mounts in number of parts. This ensures that high level mounts don't
// shadow other mounts.
sort.Sort(orderedMounts(mounts))
// Mount cgroup into the container as readonly, which inherits docker's behavior.
// TreatRoMountsAsRro does not apply here, as /sys/fs/cgroup is not a volume.
s.Mounts = append(s.Mounts, runtimespec.Mount{
Source: "cgroup",
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
})
// Copy all mounts from default mounts, except for
// - mounts overridden by supplied mount;
// - all mounts under /dev if a supplied /dev is present.
mountSet := make(map[string]struct{})
for _, m := range mounts {
mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
}
defaultMounts := s.Mounts
s.Mounts = nil
for _, m := range defaultMounts {
dst := filepath.Clean(m.Destination)
if _, ok := mountSet[dst]; ok {
// filter out mount overridden by a supplied mount
continue
}
if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
// filter out everything under /dev if /dev is a supplied mount
continue
}
s.Mounts = append(s.Mounts, m)
}
for _, mount := range mounts {
var (
dst = mount.GetContainerPath()
src = mount.GetHostPath()
)
// Create the host path if it doesn't exist.
// TODO(random-liu): Add CRI validation test for this case.
if _, err := osi.Stat(src); err != nil {
if !os.IsNotExist(err) {
return fmt.Errorf("failed to stat %q: %w", src, err)
}
if err := osi.MkdirAll(src, 0755); err != nil {
return fmt.Errorf("failed to mkdir %q: %w", src, err)
}
}
// TODO(random-liu): Add cri-containerd integration test or cri validation test
// for this.
src, err := osi.ResolveSymbolicLink(src)
if err != nil {
return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
}
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
options := []string{"rbind"}
switch mount.GetPropagation() {
case runtime.MountPropagation_PROPAGATION_PRIVATE:
options = append(options, "rprivate")
// Since default root propagation in runc is rprivate ignore
// setting the root propagation
case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
if err := ensureShared(src, osi.LookupMount); err != nil {
return err
}
options = append(options, "rshared")
s.Linux.RootfsPropagation = "rshared"
case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
return err
}
options = append(options, "rslave")
if s.Linux.RootfsPropagation != "rshared" &&
s.Linux.RootfsPropagation != "rslave" {
s.Linux.RootfsPropagation = "rslave"
}
default:
log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
options = append(options, "rprivate")
}
var srcIsDir bool
if srcSt, err := osi.Stat(src); err != nil {
if errors.Is(err, os.ErrNotExist) { // happens when osi is FakeOS
srcIsDir = true // assume src to be dir
} else {
return fmt.Errorf("failed to stat mount source %q: %w", src, err)
}
} else if srcSt != nil { // srcSt can be nil when osi is FakeOS
srcIsDir = srcSt.IsDir()
}
// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
// is readonly. This is different from docker's behavior, but make more sense.
if mount.GetReadonly() {
if rtConfig != nil && rtConfig.TreatRoMountsAsRro && srcIsDir {
options = append(options, "rro")
} else {
options = append(options, "ro")
}
} else {
options = append(options, "rw")
}
if mount.GetSelinuxRelabel() {
ENOTSUP := syscall.Errno(0x5f) // Linux specific error code, this branch will not execute on non Linux platforms.
if err := label.Relabel(src, mountLabel, false); err != nil && err != ENOTSUP {
return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
}
}
var uidMapping []runtimespec.LinuxIDMapping
if mount.UidMappings != nil {
for _, mapping := range mount.UidMappings {
uidMapping = append(uidMapping, runtimespec.LinuxIDMapping{
HostID: mapping.HostId,
ContainerID: mapping.ContainerId,
Size: mapping.Length,
})
}
}
var gidMapping []runtimespec.LinuxIDMapping
if mount.GidMappings != nil {
for _, mapping := range mount.GidMappings {
gidMapping = append(gidMapping, runtimespec.LinuxIDMapping{
HostID: mapping.HostId,
ContainerID: mapping.ContainerId,
Size: mapping.Length,
})
}
}
s.Mounts = append(s.Mounts, runtimespec.Mount{
Source: src,
Destination: dst,
Type: "bind",
Options: options,
UIDMappings: uidMapping,
GIDMappings: gidMapping,
})
}
return nil
}
}
// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
mountInfo, err := lookupMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
optsSplit := strings.Split(mountInfo.Optional, " ")
for _, opt := range optsSplit {
if strings.HasPrefix(opt, "shared:") {
return nil
}
}
return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint)
}
// ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
mountInfo, err := lookupMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
optsSplit := strings.Split(mountInfo.Optional, " ")
for _, opt := range optsSplit {
if strings.HasPrefix(opt, "shared:") {
return nil
} else if strings.HasPrefix(opt, "master:") {
return nil
}
}
return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint)
}
// getDeviceUserGroupID() is used to find the right uid/gid
// value for the device node created in the container namespace.
// The runtime executes mknod() and chmod()s the created
// device with the values returned here.
//
// On Linux, uid and gid are sufficient and the user/groupname do not
// need to be resolved.
//
// TODO(mythi): In case of user namespaces, the runtime simply bind
// mounts the devices from the host. Additional logic is needed
// to check that the runtimes effective UID/GID on the host has the
// permissions to access the device node and/or the right user namespace
// mappings are created.
//
// Ref: https://github.com/kubernetes/kubernetes/issues/92211
func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
if runAsVal != nil {
return uint32(runAsVal.GetValue())
}
return 0
}
// WithDevices sets the provided devices onto the container spec
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &runtimespec.LinuxResources{}
}
oldDevices := len(s.Linux.Devices)
for _, device := range config.GetDevices() {
path, err := osi.ResolveSymbolicLink(device.HostPath)
if err != nil {
return err
}
o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
if err := o(ctx, client, c, s); err != nil {
return err
}
}
if enableDeviceOwnershipFromSecurityContext {
UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser())
GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup())
// Loop all new devices added by oci.WithDevices() to update their
// dev.UID/dev.GID.
//
// non-zero UID/GID from SecurityContext is used to override host's
// device UID/GID for the container.
for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
if UID != 0 {
*s.Linux.Devices[idx].UID = UID
}
if GID != 0 {
*s.Linux.Devices[idx].GID = GID
}
}
}
return nil
}
}
// WithResources sets the provided resource restrictions
func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if resources == nil {
return nil
}
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &runtimespec.LinuxResources{}
}
if s.Linux.Resources.CPU == nil {
s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
}
if s.Linux.Resources.Memory == nil {
s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
}
var (
p = uint64(resources.GetCpuPeriod())
q = resources.GetCpuQuota()
shares = uint64(resources.GetCpuShares())
limit = resources.GetMemoryLimitInBytes()
swapLimit = resources.GetMemorySwapLimitInBytes()
hugepages = resources.GetHugepageLimits()
)
if p != 0 {
s.Linux.Resources.CPU.Period = &p
}
if q != 0 {
s.Linux.Resources.CPU.Quota = &q
}
if shares != 0 {
s.Linux.Resources.CPU.Shares = &shares
}
if cpus := resources.GetCpusetCpus(); cpus != "" {
s.Linux.Resources.CPU.Cpus = cpus
}
if mems := resources.GetCpusetMems(); mems != "" {
s.Linux.Resources.CPU.Mems = resources.GetCpusetMems()
}
if limit != 0 {
s.Linux.Resources.Memory.Limit = &limit
// swap/memory limit should be equal to prevent container from swapping by default
if swapLimit == 0 && SwapControllerAvailable() {
s.Linux.Resources.Memory.Swap = &limit
}
}
if swapLimit != 0 && SwapControllerAvailable() {
s.Linux.Resources.Memory.Swap = &swapLimit
}
if !disableHugetlbController {
if isHugetlbControllerPresent() {
for _, limit := range hugepages {
s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
Pagesize: limit.PageSize,
Limit: limit.Limit,
})
}
} else {
if !tolerateMissingHugetlbController {
return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
"Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
}
log.L.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
}
}
if unified := resources.GetUnified(); unified != nil {
if s.Linux.Resources.Unified == nil {
s.Linux.Resources.Unified = make(map[string]string)
}
for k, v := range unified {
s.Linux.Resources.Unified[k] = v
}
}
return nil
}
}
// WithOOMScoreAdj sets the oom score
func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
resources := config.GetLinux().GetResources()
if resources == nil {
return nil
}
adj := int(resources.GetOomScoreAdj())
if restrict {
var err error
adj, err = restrictOOMScoreAdj(adj)
if err != nil {
return err
}
}
s.Process.OOMScoreAdj = &adj
return nil
}
}
// WithPodOOMScoreAdj sets the oom score for the pod sandbox
func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
if restrict {
var err error
adj, err = restrictOOMScoreAdj(adj)
if err != nil {
return err
}
}
s.Process.OOMScoreAdj = &adj
return nil
}
}
func getCurrentOOMScoreAdj() (int, error) {
b, err := os.ReadFile("/proc/self/oom_score_adj")
if err != nil {
return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
}
s := strings.TrimSpace(string(b))
i, err := strconv.Atoi(s)
if err != nil {
return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
}
return i, nil
}
func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
currentOOMScoreAdj, err := getCurrentOOMScoreAdj()
if err != nil {
return preferredOOMScoreAdj, err
}
if preferredOOMScoreAdj < currentOOMScoreAdj {
return currentOOMScoreAdj, nil
}
return preferredOOMScoreAdj, nil
}

View File

@@ -0,0 +1,47 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestMergeGids(t *testing.T) {
gids1 := []uint32{3, 2, 1}
gids2 := []uint32{2, 3, 4}
assert.Equal(t, []uint32{1, 2, 3, 4}, mergeGids(gids1, gids2))
}
func TestRestrictOOMScoreAdj(t *testing.T) {
current, err := getCurrentOOMScoreAdj()
require.NoError(t, err)
got, err := restrictOOMScoreAdj(current - 1)
require.NoError(t, err)
assert.Equal(t, got, current)
got, err = restrictOOMScoreAdj(current)
require.NoError(t, err)
assert.Equal(t, got, current)
got, err = restrictOOMScoreAdj(current + 1)
require.NoError(t, err)
assert.Equal(t, got, current+1)
}

View File

@@ -0,0 +1,42 @@
//go:build !linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/oci"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func isHugetlbControllerPresent() bool {
return false
}
func SwapControllerAvailable() bool {
return false
}
// WithCDI does nothing on non-Linux platforms.
func WithCDI(_ map[string]string, _ []*runtime.CDIDevice) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, container *containers.Container, spec *oci.Spec) error {
return nil
}
}

View File

@@ -0,0 +1,36 @@
//go:build !windows
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/oci"
"github.com/containerd/errdefs"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func WithProcessCommandLineOrArgsForWindows(config *runtime.ContainerConfig, image *imagespec.ImageConfig) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
return errdefs.ErrNotImplemented
}
}

View File

@@ -0,0 +1,375 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/internal/cri/util"
"github.com/containerd/containerd/v2/pkg/oci"
)
// DefaultSandboxCPUshares is default cpu shares for sandbox container.
// TODO(windows): Revisit cpu shares for windows (https://github.com/containerd/cri/issues/1297)
const DefaultSandboxCPUshares = 2
// WithRelativeRoot sets the root for the container
func WithRelativeRoot(root string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if s.Root == nil {
s.Root = &runtimespec.Root{}
}
s.Root.Path = root
return nil
}
}
// WithoutRoot sets the root to nil for the container.
func WithoutRoot(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
s.Root = nil
return nil
}
// WithProcessArgs sets the process args on the spec based on the image and runtime config
func WithProcessArgs(config *runtime.ContainerConfig, image *imagespec.ImageConfig) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
command, args := config.GetCommand(), config.GetArgs()
// The following logic is migrated from https://github.com/moby/moby/blob/master/daemon/commit.go
// TODO(random-liu): Clearly define the commands overwrite behavior.
if len(command) == 0 {
// Copy array to avoid data race.
if len(args) == 0 {
args = append([]string{}, image.Cmd...)
}
if command == nil {
if !(len(image.Entrypoint) == 1 && image.Entrypoint[0] == "") {
command = append([]string{}, image.Entrypoint...)
}
}
}
if len(command) == 0 && len(args) == 0 {
return errors.New("no command specified")
}
return oci.WithProcessArgs(append(command, args...)...)(ctx, client, c, s)
}
}
// mounts defines how to sort runtime.Mount.
// This is the same with the Docker implementation:
//
// https://github.com/moby/moby/blob/17.05.x/daemon/volumes.go#L26
type orderedMounts []*runtime.Mount
// Len returns the number of mounts. Used in sorting.
func (m orderedMounts) Len() int {
return len(m)
}
// Less returns true if the number of parts (a/b/c would be 3 parts) in the
// mount indexed by parameter 1 is less than that of the mount indexed by
// parameter 2. Used in sorting.
func (m orderedMounts) Less(i, j int) bool {
return m.parts(i) < m.parts(j)
}
// Swap swaps two items in an array of mounts. Used in sorting
func (m orderedMounts) Swap(i, j int) {
m[i], m[j] = m[j], m[i]
}
// parts returns the number of parts in the destination of a mount. Used in sorting.
func (m orderedMounts) parts(i int) int {
return strings.Count(filepath.Clean(m[i].ContainerPath), string(os.PathSeparator))
}
// WithAnnotation sets the provided annotation
func WithAnnotation(k, v string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Annotations == nil {
s.Annotations = make(map[string]string)
}
s.Annotations[k] = v
return nil
}
}
// WithAdditionalGIDs adds any additional groups listed for a particular user in the
// /etc/groups file of the image's root filesystem to the OCI spec's additionalGids array.
func WithAdditionalGIDs(userstr string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
gids := s.Process.User.AdditionalGids
if err := oci.WithAdditionalGIDs(userstr)(ctx, client, c, s); err != nil {
return err
}
// Merge existing gids and new gids.
s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, gids)
return nil
}
}
func mergeGids(gids1, gids2 []uint32) []uint32 {
gidsMap := make(map[uint32]struct{})
for _, gid1 := range gids1 {
gidsMap[gid1] = struct{}{}
}
for _, gid2 := range gids2 {
gidsMap[gid2] = struct{}{}
}
var gids []uint32
for gid := range gidsMap {
gids = append(gids, gid)
}
sort.Slice(gids, func(i, j int) bool { return gids[i] < gids[j] })
return gids
}
// WithoutDefaultSecuritySettings removes the default security settings generated on a spec
func WithoutDefaultSecuritySettings(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
// Make sure no default seccomp/apparmor is specified
s.Process.ApparmorProfile = ""
if s.Linux != nil {
s.Linux.Seccomp = nil
}
// Remove default rlimits (See https://github.com/containerd/cri/issues/515)
s.Process.Rlimits = nil
return nil
}
// WithCapabilities sets the provided capabilities from the security context
func WithCapabilities(sc *runtime.LinuxContainerSecurityContext, allCaps []string) oci.SpecOpts {
capabilities := sc.GetCapabilities()
if capabilities == nil {
return nullOpt
}
var opts []oci.SpecOpts
// Add/drop all capabilities if "all" is specified, so that
// following individual add/drop could still work. E.g.
// AddCapabilities: []string{"ALL"}, DropCapabilities: []string{"CHOWN"}
// will be all capabilities without `CAP_CHOWN`.
if util.InStringSlice(capabilities.GetAddCapabilities(), "ALL") {
opts = append(opts, oci.WithCapabilities(allCaps))
}
if util.InStringSlice(capabilities.GetDropCapabilities(), "ALL") {
opts = append(opts, oci.WithCapabilities(nil))
}
var caps []string
for _, c := range capabilities.GetAddCapabilities() {
if strings.ToUpper(c) == "ALL" {
continue
}
// Capabilities in CRI doesn't have `CAP_` prefix, so add it.
caps = append(caps, "CAP_"+strings.ToUpper(c))
}
opts = append(opts, oci.WithAddedCapabilities(caps))
caps = []string{}
for _, c := range capabilities.GetDropCapabilities() {
if strings.ToUpper(c) == "ALL" {
continue
}
caps = append(caps, "CAP_"+strings.ToUpper(c))
}
opts = append(opts, oci.WithDroppedCapabilities(caps))
return oci.Compose(opts...)
}
func nullOpt(_ context.Context, _ oci.Client, _ *containers.Container, _ *runtimespec.Spec) error {
return nil
}
// WithoutAmbientCaps removes the ambient caps from the spec
func WithoutAmbientCaps(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
if s.Process.Capabilities == nil {
s.Process.Capabilities = &runtimespec.LinuxCapabilities{}
}
s.Process.Capabilities.Ambient = nil
return nil
}
// WithDisabledCgroups clears the Cgroups Path from the spec
func WithDisabledCgroups(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
s.Linux.CgroupsPath = ""
return nil
}
// WithSelinuxLabels sets the mount and process labels
func WithSelinuxLabels(process, mount string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
s.Linux.MountLabel = mount
s.Process.SelinuxLabel = process
return nil
}
}
// WithSysctls sets the provided sysctls onto the spec
func WithSysctls(sysctls map[string]string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Linux.Sysctl == nil {
s.Linux.Sysctl = make(map[string]string)
}
for k, v := range sysctls {
s.Linux.Sysctl[k] = v
}
return nil
}
}
// WithSupplementalGroups sets the supplemental groups for the process
func WithSupplementalGroups(groups []int64) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Process == nil {
s.Process = &runtimespec.Process{}
}
var guids []uint32
for _, g := range groups {
guids = append(guids, uint32(g))
}
s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, guids)
return nil
}
}
// WithDefaultSandboxShares sets the default sandbox CPU shares
func WithDefaultSandboxShares(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &runtimespec.LinuxResources{}
}
if s.Linux.Resources.CPU == nil {
s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
}
i := uint64(DefaultSandboxCPUshares)
s.Linux.Resources.CPU.Shares = &i
return nil
}
// WithoutNamespace removes the provided namespace
func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Linux == nil {
return nil
}
var namespaces []runtimespec.LinuxNamespace
for i, ns := range s.Linux.Namespaces {
if ns.Type != t {
namespaces = append(namespaces, s.Linux.Namespaces[i])
}
}
s.Linux.Namespaces = namespaces
return nil
}
}
// WithPodNamespaces sets the pod namespaces for the container
func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts {
namespaces := config.GetNamespaceOptions()
opts := []oci.SpecOpts{
oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.NetworkNamespace, Path: GetNetworkNamespace(sandboxPid)}),
oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.IPCNamespace, Path: GetIPCNamespace(sandboxPid)}),
oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UTSNamespace, Path: GetUTSNamespace(sandboxPid)}),
}
if namespaces.GetPid() != runtime.NamespaceMode_CONTAINER {
opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.PIDNamespace, Path: GetPIDNamespace(targetPid)}))
}
if namespaces.GetUsernsOptions() != nil {
switch namespaces.GetUsernsOptions().GetMode() {
case runtime.NamespaceMode_NODE:
// Nothing to do. Not adding userns field uses the node userns.
case runtime.NamespaceMode_POD:
opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UserNamespace, Path: GetUserNamespace(sandboxPid)}))
opts = append(opts, oci.WithUserNamespace(uids, gids))
}
}
return oci.Compose(opts...)
}
const (
// netNSFormat is the format of network namespace of a process.
netNSFormat = "/proc/%v/ns/net"
// ipcNSFormat is the format of ipc namespace of a process.
ipcNSFormat = "/proc/%v/ns/ipc"
// utsNSFormat is the format of uts namespace of a process.
utsNSFormat = "/proc/%v/ns/uts"
// pidNSFormat is the format of pid namespace of a process.
pidNSFormat = "/proc/%v/ns/pid"
// userNSFormat is the format of user namespace of a process.
userNSFormat = "/proc/%v/ns/user"
)
// GetNetworkNamespace returns the network namespace of a process.
func GetNetworkNamespace(pid uint32) string {
return fmt.Sprintf(netNSFormat, pid)
}
// GetIPCNamespace returns the ipc namespace of a process.
func GetIPCNamespace(pid uint32) string {
return fmt.Sprintf(ipcNSFormat, pid)
}
// GetUTSNamespace returns the uts namespace of a process.
func GetUTSNamespace(pid uint32) string {
return fmt.Sprintf(utsNSFormat, pid)
}
// GetPIDNamespace returns the pid namespace of a process.
func GetPIDNamespace(pid uint32) string {
return fmt.Sprintf(pidNSFormat, pid)
}
// GetUserNamespace returns the user namespace of a process.
func GetUserNamespace(pid uint32) string {
return fmt.Sprintf(userNSFormat, pid)
}

View File

@@ -0,0 +1,46 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"sort"
"testing"
"github.com/stretchr/testify/assert"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func TestOrderedMounts(t *testing.T) {
mounts := []*runtime.Mount{
{ContainerPath: "/a/b/c"},
{ContainerPath: "/a/b"},
{ContainerPath: "/a/b/c/d"},
{ContainerPath: "/a"},
{ContainerPath: "/b"},
{ContainerPath: "/b/c"},
}
expected := []*runtime.Mount{
{ContainerPath: "/a"},
{ContainerPath: "/b"},
{ContainerPath: "/a/b"},
{ContainerPath: "/b/c"},
{ContainerPath: "/a/b/c"},
{ContainerPath: "/a/b/c/d"},
}
sort.Stable(orderedMounts(mounts))
assert.Equal(t, expected, mounts)
}

View File

@@ -0,0 +1,121 @@
//go:build windows
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"errors"
"strings"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/oci"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/windows"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func escapeAndCombineArgsWindows(args []string) string {
escaped := make([]string, len(args))
for i, a := range args {
escaped[i] = windows.EscapeArg(a)
}
return strings.Join(escaped, " ")
}
// WithProcessCommandLineOrArgsForWindows sets the process command line or process args on the spec based on the image
// and runtime config
// If image.ArgsEscaped field is set, this function sets the process command line and if not, it sets the
// process args field
func WithProcessCommandLineOrArgsForWindows(config *runtime.ContainerConfig, image *imagespec.ImageConfig) oci.SpecOpts {
if image.ArgsEscaped { //nolint:staticcheck // ArgsEscaped is deprecated
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
// firstArgFromImg is a flag that is returned to indicate that the first arg in the slice comes from either the
// image Entrypoint or Cmd. If the first arg instead comes from the container config (e.g. overriding the image values),
// it should be false. This is done to support the non-OCI ArgsEscaped field that Docker used to determine how the image
// entrypoint and cmd should be interpreted.
//
args, firstArgFromImg, err := getArgs(image.Entrypoint, image.Cmd, config.GetCommand(), config.GetArgs())
if err != nil {
return err
}
var cmdLine string
if image.ArgsEscaped && firstArgFromImg { //nolint:staticcheck // ArgsEscaped is deprecated
cmdLine = args[0]
if len(args) > 1 {
cmdLine += " " + escapeAndCombineArgsWindows(args[1:])
}
} else {
cmdLine = escapeAndCombineArgsWindows(args)
}
return oci.WithProcessCommandLine(cmdLine)(ctx, client, c, s)
}
}
// if ArgsEscaped is not set
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
args, _, err := getArgs(image.Entrypoint, image.Cmd, config.GetCommand(), config.GetArgs())
if err != nil {
return err
}
return oci.WithProcessArgs(args...)(ctx, client, c, s)
}
}
// getArgs is used to evaluate the overall args for the container by taking into account the image command and entrypoints
// along with the container command and entrypoints specified through the podspec if any
func getArgs(imgEntrypoint []string, imgCmd []string, ctrEntrypoint []string, ctrCmd []string) ([]string, bool, error) {
//nolint:dupword
// firstArgFromImg is a flag that is returned to indicate that the first arg in the slice comes from either the image
// Entrypoint or Cmd. If the first arg instead comes from the container config (e.g. overriding the image values),
// it should be false.
// Essentially this means firstArgFromImg should be true iff:
// Ctr entrypoint ctr cmd image entrypoint image cmd firstArgFromImg
// --------------------------------------------------------------------------------
// nil nil exists nil true
// nil nil nil exists true
// This is needed to support the non-OCI ArgsEscaped field used by Docker. ArgsEscaped is used for
// Windows images to indicate that the command has already been escaped and should be
// used directly as the command line.
var firstArgFromImg bool
entrypoint, cmd := ctrEntrypoint, ctrCmd
// The following logic is migrated from https://github.com/moby/moby/blob/master/daemon/commit.go
// TODO(random-liu): Clearly define the commands overwrite behavior.
if len(entrypoint) == 0 {
// Copy array to avoid data race.
if len(cmd) == 0 {
cmd = append([]string{}, imgCmd...)
if len(imgCmd) > 0 {
firstArgFromImg = true
}
}
if entrypoint == nil {
entrypoint = append([]string{}, imgEntrypoint...)
if len(imgEntrypoint) > 0 || len(ctrCmd) == 0 {
firstArgFromImg = true
}
}
}
if len(entrypoint) == 0 && len(cmd) == 0 {
return nil, false, errors.New("no command specified")
}
return append(entrypoint, cmd...), firstArgFromImg, nil
}

View File

@@ -0,0 +1,261 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/oci"
osinterface "github.com/containerd/containerd/v2/pkg/os"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
// namedPipePath returns true if the given path is to a named pipe.
func namedPipePath(p string) bool {
return strings.HasPrefix(p, `\\.\pipe\`)
}
// cleanMount returns a cleaned version of the mount path. The input is returned
// as-is if it is a named pipe path.
func cleanMount(p string) string {
if namedPipePath(p) {
return p
}
return filepath.Clean(p)
}
func parseMount(osi osinterface.OS, mount *runtime.Mount) (*runtimespec.Mount, error) {
var (
dst = mount.GetContainerPath()
src = mount.GetHostPath()
)
// In the case of a named pipe mount on Windows, don't stat the file
// or do other operations that open it, as that could interfere with
// the listening process. filepath.Clean also breaks named pipe
// paths, so don't use it.
if !namedPipePath(src) {
if _, err := osi.Stat(src); err != nil {
// Create the host path if it doesn't exist. This will align
// the behavior with the Linux implementation, but it doesn't
// align with Docker's behavior on Windows.
if !os.IsNotExist(err) {
return nil, fmt.Errorf("failed to stat %q: %w", src, err)
}
if err := osi.MkdirAll(src, 0755); err != nil {
return nil, fmt.Errorf("failed to mkdir %q: %w", src, err)
}
}
var err error
originalSrc := src
src, err = osi.ResolveSymbolicLink(src)
if err != nil {
return nil, fmt.Errorf("failed to resolve symlink %q: %w", originalSrc, err)
}
// hcsshim requires clean path, especially '/' -> '\'. Additionally,
// for the destination, absolute paths should have the C: prefix.
src = filepath.Clean(src)
// filepath.Clean adds a '.' at the end if the path is a
// drive (like Z:, E: etc.). Keeping this '.' in the path
// causes incorrect parameter error when starting the
// container on windows. Remove it here.
if !(len(dst) == 2 && dst[1] == ':') {
dst = filepath.Clean(dst)
if dst[0] == '\\' {
dst = "C:" + dst
}
} else if dst[0] == 'c' || dst[0] == 'C' {
return nil, fmt.Errorf("destination path can not be C drive")
}
}
var options []string
// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
// is readonly. This is different from docker's behavior, but make more sense.
if mount.GetReadonly() {
options = append(options, "ro")
} else {
options = append(options, "rw")
}
return &runtimespec.Mount{Source: src, Destination: dst, Options: options}, nil
}
// WithWindowsMounts sorts and adds runtime and CRI mounts to the spec for
// windows container.
func WithWindowsMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
// is mounted by both a CRI mount and an extra mount, the CRI mount will
// be kept.
var (
criMounts = config.GetMounts()
mounts = append([]*runtime.Mount{}, criMounts...)
)
// Copy all mounts from extra mounts, except for mounts overridden by CRI.
for _, e := range extra {
found := false
for _, c := range criMounts {
if cleanMount(e.ContainerPath) == cleanMount(c.ContainerPath) {
found = true
break
}
}
if !found {
mounts = append(mounts, e)
}
}
// Sort mounts in number of parts. This ensures that high level mounts don't
// shadow other mounts.
sort.Sort(orderedMounts(mounts))
// Copy all mounts from default mounts, except for
// mounts overridden by supplied mount;
mountSet := make(map[string]struct{})
for _, m := range mounts {
mountSet[cleanMount(m.ContainerPath)] = struct{}{}
}
defaultMounts := s.Mounts
s.Mounts = nil
for _, m := range defaultMounts {
dst := cleanMount(m.Destination)
if _, ok := mountSet[dst]; ok {
// filter out mount overridden by a supplied mount
continue
}
s.Mounts = append(s.Mounts, m)
}
for _, mount := range mounts {
parsedMount, err := parseMount(osi, mount)
if err != nil {
return err
}
s.Mounts = append(s.Mounts, *parsedMount)
}
return nil
}
}
// WithWindowsResources sets the provided resource restrictions for windows.
func WithWindowsResources(resources *runtime.WindowsContainerResources) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if resources == nil {
return nil
}
if s.Windows == nil {
s.Windows = &runtimespec.Windows{}
}
if s.Windows.Resources == nil {
s.Windows.Resources = &runtimespec.WindowsResources{}
}
if s.Windows.Resources.Memory == nil {
s.Windows.Resources.Memory = &runtimespec.WindowsMemoryResources{}
}
var (
count = uint64(resources.GetCpuCount())
shares = uint16(resources.GetCpuShares())
max = uint16(resources.GetCpuMaximum())
limit = uint64(resources.GetMemoryLimitInBytes())
)
if s.Windows.Resources.CPU == nil && (count != 0 || shares != 0 || max != 0) {
s.Windows.Resources.CPU = &runtimespec.WindowsCPUResources{}
}
if count != 0 {
s.Windows.Resources.CPU.Count = &count
}
if shares != 0 {
s.Windows.Resources.CPU.Shares = &shares
}
if max != 0 {
s.Windows.Resources.CPU.Maximum = &max
}
if limit != 0 {
s.Windows.Resources.Memory.Limit = &limit
}
return nil
}
}
// WithWindowsDefaultSandboxShares sets the default sandbox CPU shares
func WithWindowsDefaultSandboxShares(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Windows == nil {
s.Windows = &runtimespec.Windows{}
}
if s.Windows.Resources == nil {
s.Windows.Resources = &runtimespec.WindowsResources{}
}
if s.Windows.Resources.CPU == nil {
s.Windows.Resources.CPU = &runtimespec.WindowsCPUResources{}
}
i := uint16(DefaultSandboxCPUshares)
s.Windows.Resources.CPU.Shares = &i
return nil
}
// WithWindowsCredentialSpec assigns `credentialSpec` to the
// `runtime.Spec.Windows.CredentialSpec` field.
func WithWindowsCredentialSpec(credentialSpec string) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
if s.Windows == nil {
s.Windows = &runtimespec.Windows{}
}
s.Windows.CredentialSpec = credentialSpec
return nil
}
}
// WithWindowsDevices sets the provided devices onto the container spec
func WithWindowsDevices(config *runtime.ContainerConfig) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
for _, device := range config.GetDevices() {
if device.ContainerPath != "" {
return fmt.Errorf("unexpected ContainerPath %s, must be empty", device.ContainerPath)
}
if device.Permissions != "" {
return fmt.Errorf("unexpected Permissions %s, must be empty", device.Permissions)
}
hostPath := device.HostPath
if strings.HasPrefix(hostPath, "class/") {
hostPath = "class://" + strings.TrimPrefix(hostPath, "class/")
}
idType, id, ok := strings.Cut(hostPath, "://")
if !ok {
return fmt.Errorf("unrecognised HostPath format %v, must match IDType://ID", device.HostPath)
}
o := oci.WithWindowsDevice(idType, id)
if err := o(ctx, client, c, s); err != nil {
return fmt.Errorf("failed adding device with HostPath %v: %w", device.HostPath, err)
}
}
return nil
}
}

View File

@@ -0,0 +1,249 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package opts
import (
"context"
"fmt"
"strings"
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
"github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/namespaces"
"github.com/containerd/containerd/v2/pkg/oci"
osinterface "github.com/containerd/containerd/v2/pkg/os"
)
func TestWithDevices(t *testing.T) {
testcases := []struct {
name string
devices []*runtime.Device
isLCOW bool
expectError bool
expectedWindowsDevices []specs.WindowsDevice
}{
{
name: "empty",
expectError: false,
},
// The only supported field is HostPath
{
name: "empty fields",
devices: []*runtime.Device{{}},
expectError: true,
},
{
name: "containerPath",
devices: []*runtime.Device{{ContainerPath: "something"}},
expectError: true,
},
{
name: "permissions",
devices: []*runtime.Device{{Permissions: "something"}},
expectError: true,
},
// Produced by https://github.com/aarnaud/k8s-directx-device-plugin/blob/0f3db32622daa577c85621941682bee6f9080954/cmd/k8s-device-plugin/main.go
// This is also the syntax dockershim and cri-dockerd support (or rather, pass through to docker, which parses this syntax)
{
name: "hostPath_docker_style",
devices: []*runtime.Device{{HostPath: "class/5B45201D-F2F2-4F3B-85BB-30FF1F953599"}},
expectError: false,
expectedWindowsDevices: []specs.WindowsDevice{{ID: "5B45201D-F2F2-4F3B-85BB-30FF1F953599", IDType: "class"}},
},
// Docker _only_ accepts `class` ID Type, so anything else should fail.
// See https://github.com/moby/moby/blob/v20.10.13/daemon/oci_windows.go#L283-L294
{
name: "hostPath_docker_style_non-class_idtype",
devices: []*runtime.Device{{HostPath: "vpci-location-path/5B45201D-F2F2-4F3B-85BB-30FF1F953599"}},
expectError: true,
},
// A bunch of examples from https://github.com/microsoft/hcsshim/blob/v0.9.2/test/cri-containerd/container_virtual_device_test.go
{
name: "hostPath_hcsshim_lcow_gpu",
// Not actually a GPU PCIP instance, but my personal machine doesn't have any PCIP devices, so I found one on the 'net.
devices: []*runtime.Device{{HostPath: `gpu://PCIP\VEN_8086&DEV_43A2&SUBSYS_72708086&REV_00\3&11583659&0&F5`}},
isLCOW: true,
expectError: false,
expectedWindowsDevices: []specs.WindowsDevice{{ID: `PCIP\VEN_8086&DEV_43A2&SUBSYS_72708086&REV_00\3&11583659&0&F5`, IDType: "gpu"}},
},
{
name: "hostPath_hcsshim_wcow_location_path",
devices: []*runtime.Device{{HostPath: "vpci-location-path://PCIROOT(0)#PCI(0100)#PCI(0000)#PCI(0000)#PCI(0001)"}},
expectError: false,
expectedWindowsDevices: []specs.WindowsDevice{{ID: "PCIROOT(0)#PCI(0100)#PCI(0000)#PCI(0000)#PCI(0001)", IDType: "vpci-location-path"}},
},
{
name: "hostPath_hcsshim_wcow_class_guid",
devices: []*runtime.Device{{HostPath: "class://5B45201D-F2F2-4F3B-85BB-30FF1F953599"}},
expectError: false,
expectedWindowsDevices: []specs.WindowsDevice{{ID: "5B45201D-F2F2-4F3B-85BB-30FF1F953599", IDType: "class"}},
},
{
name: "hostPath_hcsshim_wcow_gpu_hyper-v",
// Not actually a GPU PCIP instance, but my personal machine doesn't have any PCIP devices, so I found one on the 'net.
devices: []*runtime.Device{{HostPath: `vpci://PCIP\VEN_8086&DEV_43A2&SUBSYS_72708086&REV_00\3&11583659&0&F5`}},
expectError: false,
expectedWindowsDevices: []specs.WindowsDevice{{ID: `PCIP\VEN_8086&DEV_43A2&SUBSYS_72708086&REV_00\3&11583659&0&F5`, IDType: "vpci"}},
},
// Example from https://github.com/microsoft/hcsshim/blob/v0.9.2/test/cri-containerd/container_test.go
// According to https://github.com/jterry75/cri/blob/f8e83e63cc027d0e9c0c984f9db3cba58d3672d4/pkg/server/container_create_windows.go#L625-L649
// this is intended to generate LinuxDevice entries that the GCS shim in a LCOW container host will remap
// into device mounts from its own in-UVM kernel.
// From discussion on https://github.com/containerd/containerd/pull/6618, we reject this syntax for now.
{
name: "hostPath_hcsshim_lcow_sandbox_device",
devices: []*runtime.Device{{HostPath: "/dev/fuse"}},
isLCOW: true,
expectError: true,
},
// Some edge cases suggested by the above real-world examples
{
name: "hostPath_no_slash",
devices: []*runtime.Device{{HostPath: "no_slash"}},
expectError: true,
},
{
name: "hostPath_but_no_type",
devices: []*runtime.Device{{HostPath: "://5B45201D-F2F2-4F3B-85BB-30FF1F953599"}},
expectError: true,
},
{
name: "hostPath_but_no_id",
devices: []*runtime.Device{{HostPath: "gpu://"}},
expectError: false,
expectedWindowsDevices: []specs.WindowsDevice{{ID: "", IDType: "gpu"}},
},
{
name: "hostPath_dockerstyle_with_slashes_in_id",
devices: []*runtime.Device{{HostPath: "class/slashed/id"}},
expectError: false,
expectedWindowsDevices: []specs.WindowsDevice{{ID: "slashed/id", IDType: "class"}},
},
{
name: "hostPath_docker_style_non-class_idtypewith_slashes_in_id",
devices: []*runtime.Device{{HostPath: "vpci-location-path/slashed/id"}},
expectError: true,
},
{
name: "hostPath_hcsshim_wcow_location_path_twice",
devices: []*runtime.Device{
{HostPath: "vpci-location-path://PCIROOT(0)#PCI(0100)#PCI(0000)#PCI(0000)#PCI(0001)"},
{HostPath: "vpci-location-path://PCIROOT(0)#PCI(0100)#PCI(0000)#PCI(0000)#PCI(0002)"}},
expectError: false,
expectedWindowsDevices: []specs.WindowsDevice{
{ID: "PCIROOT(0)#PCI(0100)#PCI(0000)#PCI(0000)#PCI(0001)", IDType: "vpci-location-path"},
{ID: "PCIROOT(0)#PCI(0100)#PCI(0000)#PCI(0000)#PCI(0002)", IDType: "vpci-location-path"},
},
},
}
for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
var (
ctx = namespaces.WithNamespace(context.Background(), "testing")
c = &containers.Container{ID: t.Name()}
)
config := runtime.ContainerConfig{}
config.Devices = tc.devices
specOpts := []oci.SpecOpts{WithWindowsDevices(&config)}
platform := "windows"
if tc.isLCOW {
platform = "linux"
}
spec, err := oci.GenerateSpecWithPlatform(ctx, nil, platform, c, specOpts...)
if tc.expectError {
assert.Error(t, err)
} else {
require.NoError(t, err)
}
// Ensure we got the right LCOWness in the spec
if tc.isLCOW {
assert.NotNil(t, spec.Linux)
} else {
assert.Nil(t, spec.Linux)
}
if len(tc.expectedWindowsDevices) != 0 {
require.NotNil(t, spec.Windows)
require.NotNil(t, spec.Windows.Devices)
assert.Equal(t, spec.Windows.Devices, tc.expectedWindowsDevices)
} else if spec.Windows != nil && spec.Windows.Devices != nil {
assert.Empty(t, spec.Windows.Devices)
}
if spec.Linux != nil && spec.Linux.Devices != nil {
assert.Empty(t, spec.Linux.Devices)
}
})
}
}
func TestDriveMounts(t *testing.T) {
tests := []struct {
mnt *runtime.Mount
expectedContainerPath string
expectedError error
}{
{&runtime.Mount{HostPath: `C:\`, ContainerPath: `D:\foo`}, `D:\foo`, nil},
{&runtime.Mount{HostPath: `C:\`, ContainerPath: `D:\`}, `D:\`, nil},
{&runtime.Mount{HostPath: `C:\`, ContainerPath: `D:`}, `D:`, nil},
{&runtime.Mount{HostPath: `\\.\pipe\a_fake_pipe_name_that_shouldnt_exist`, ContainerPath: `\\.\pipe\foo`}, `\\.\pipe\foo`, nil},
// If `C:\` is passed as container path it should continue and forward that to HCS and fail
// to align with docker's behavior.
{&runtime.Mount{HostPath: `C:\`, ContainerPath: `C:\`}, `C:\`, nil},
// If `C:` is passed we can detect and fail immediately.
{&runtime.Mount{HostPath: `C:\`, ContainerPath: `C:`}, ``, fmt.Errorf("destination path can not be C drive")},
}
var realOS osinterface.RealOS
for _, test := range tests {
parsedMount, err := parseMount(realOS, test.mnt)
if err != nil && !strings.EqualFold(err.Error(), test.expectedError.Error()) {
t.Fatalf("expected err: %s, got %s instead", test.expectedError, err)
} else if err == nil && test.expectedContainerPath != parsedMount.Destination {
t.Fatalf("expected container path: %s, got %s instead", test.expectedContainerPath, parsedMount.Destination)
}
}
}