Have separate spec builder for each platform
Signed-off-by: Maksym Pavlenko <pavlenko.maksym@gmail.com>
This commit is contained in:
parent
fdfa3519a3
commit
40be96efa9
@ -45,3 +45,10 @@ func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Sp
|
|||||||
func DeviceFromPath(path string) (*specs.LinuxDevice, error) {
|
func DeviceFromPath(path string) (*specs.LinuxDevice, error) {
|
||||||
return nil, errors.New("device from path not supported on Windows")
|
return nil, errors.New("device from path not supported on Windows")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WithDevices does nothing on Windows.
|
||||||
|
func WithDevices(devicePath, containerPath, permissions string) SpecOpts {
|
||||||
|
return func(ctx context.Context, client Client, container *containers.Container, spec *Spec) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -25,13 +25,14 @@ import (
|
|||||||
goruntime "runtime"
|
goruntime "runtime"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/containerd/continuity/fs"
|
||||||
|
|
||||||
"github.com/containerd/containerd"
|
"github.com/containerd/containerd"
|
||||||
"github.com/containerd/containerd/containers"
|
"github.com/containerd/containerd/containers"
|
||||||
"github.com/containerd/containerd/errdefs"
|
"github.com/containerd/containerd/errdefs"
|
||||||
"github.com/containerd/containerd/log"
|
"github.com/containerd/containerd/log"
|
||||||
"github.com/containerd/containerd/mount"
|
"github.com/containerd/containerd/mount"
|
||||||
"github.com/containerd/containerd/snapshots"
|
"github.com/containerd/containerd/snapshots"
|
||||||
"github.com/containerd/continuity/fs"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// WithNewSnapshot wraps `containerd.WithNewSnapshot` so that if creating the
|
// WithNewSnapshot wraps `containerd.WithNewSnapshot` so that if creating the
|
||||||
|
@ -22,8 +22,6 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
@ -31,255 +29,15 @@ import (
|
|||||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||||
"github.com/containerd/cgroups/v3"
|
"github.com/containerd/cgroups/v3"
|
||||||
"github.com/containerd/cgroups/v3/cgroup1"
|
"github.com/containerd/cgroups/v3/cgroup1"
|
||||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
|
||||||
"github.com/opencontainers/selinux/go-selinux/label"
|
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
||||||
|
|
||||||
"github.com/containerd/containerd/containers"
|
"github.com/containerd/containerd/containers"
|
||||||
"github.com/containerd/containerd/log"
|
"github.com/containerd/containerd/log"
|
||||||
"github.com/containerd/containerd/mount"
|
|
||||||
"github.com/containerd/containerd/oci"
|
"github.com/containerd/containerd/oci"
|
||||||
osinterface "github.com/containerd/containerd/pkg/os"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// WithMounts sorts and adds runtime and CRI mounts to the spec
|
// Linux dependent OCI spec opts.
|
||||||
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts {
|
|
||||||
return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
|
|
||||||
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
|
|
||||||
// is mounted by both a CRI mount and an extra mount, the CRI mount will
|
|
||||||
// be kept.
|
|
||||||
var (
|
|
||||||
criMounts = config.GetMounts()
|
|
||||||
mounts = append([]*runtime.Mount{}, criMounts...)
|
|
||||||
)
|
|
||||||
// Copy all mounts from extra mounts, except for mounts overridden by CRI.
|
|
||||||
for _, e := range extra {
|
|
||||||
found := false
|
|
||||||
for _, c := range criMounts {
|
|
||||||
if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
|
|
||||||
found = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !found {
|
|
||||||
mounts = append(mounts, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort mounts in number of parts. This ensures that high level mounts don't
|
|
||||||
// shadow other mounts.
|
|
||||||
sort.Sort(orderedMounts(mounts))
|
|
||||||
|
|
||||||
// Mount cgroup into the container as readonly, which inherits docker's behavior.
|
|
||||||
s.Mounts = append(s.Mounts, runtimespec.Mount{
|
|
||||||
Source: "cgroup",
|
|
||||||
Destination: "/sys/fs/cgroup",
|
|
||||||
Type: "cgroup",
|
|
||||||
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
|
|
||||||
})
|
|
||||||
|
|
||||||
// Copy all mounts from default mounts, except for
|
|
||||||
// - mounts overridden by supplied mount;
|
|
||||||
// - all mounts under /dev if a supplied /dev is present.
|
|
||||||
mountSet := make(map[string]struct{})
|
|
||||||
for _, m := range mounts {
|
|
||||||
mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
|
|
||||||
}
|
|
||||||
|
|
||||||
defaultMounts := s.Mounts
|
|
||||||
s.Mounts = nil
|
|
||||||
|
|
||||||
for _, m := range defaultMounts {
|
|
||||||
dst := filepath.Clean(m.Destination)
|
|
||||||
if _, ok := mountSet[dst]; ok {
|
|
||||||
// filter out mount overridden by a supplied mount
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
|
|
||||||
// filter out everything under /dev if /dev is a supplied mount
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
s.Mounts = append(s.Mounts, m)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, mount := range mounts {
|
|
||||||
var (
|
|
||||||
dst = mount.GetContainerPath()
|
|
||||||
src = mount.GetHostPath()
|
|
||||||
)
|
|
||||||
// Create the host path if it doesn't exist.
|
|
||||||
// TODO(random-liu): Add CRI validation test for this case.
|
|
||||||
if _, err := osi.Stat(src); err != nil {
|
|
||||||
if !os.IsNotExist(err) {
|
|
||||||
return fmt.Errorf("failed to stat %q: %w", src, err)
|
|
||||||
}
|
|
||||||
if err := osi.MkdirAll(src, 0755); err != nil {
|
|
||||||
return fmt.Errorf("failed to mkdir %q: %w", src, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// TODO(random-liu): Add cri-containerd integration test or cri validation test
|
|
||||||
// for this.
|
|
||||||
src, err := osi.ResolveSymbolicLink(src)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
|
|
||||||
}
|
|
||||||
if s.Linux == nil {
|
|
||||||
s.Linux = &runtimespec.Linux{}
|
|
||||||
}
|
|
||||||
options := []string{"rbind"}
|
|
||||||
switch mount.GetPropagation() {
|
|
||||||
case runtime.MountPropagation_PROPAGATION_PRIVATE:
|
|
||||||
options = append(options, "rprivate")
|
|
||||||
// Since default root propagation in runc is rprivate ignore
|
|
||||||
// setting the root propagation
|
|
||||||
case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
|
|
||||||
if err := ensureShared(src, osi.LookupMount); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
options = append(options, "rshared")
|
|
||||||
s.Linux.RootfsPropagation = "rshared"
|
|
||||||
case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
|
|
||||||
if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
options = append(options, "rslave")
|
|
||||||
if s.Linux.RootfsPropagation != "rshared" &&
|
|
||||||
s.Linux.RootfsPropagation != "rslave" {
|
|
||||||
s.Linux.RootfsPropagation = "rslave"
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
|
|
||||||
options = append(options, "rprivate")
|
|
||||||
}
|
|
||||||
|
|
||||||
// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
|
|
||||||
// is readonly. This is different from docker's behavior, but make more sense.
|
|
||||||
if mount.GetReadonly() {
|
|
||||||
options = append(options, "ro")
|
|
||||||
} else {
|
|
||||||
options = append(options, "rw")
|
|
||||||
}
|
|
||||||
|
|
||||||
if mount.GetSelinuxRelabel() {
|
|
||||||
if err := label.Relabel(src, mountLabel, false); err != nil && err != unix.ENOTSUP {
|
|
||||||
return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
s.Mounts = append(s.Mounts, runtimespec.Mount{
|
|
||||||
Source: src,
|
|
||||||
Destination: dst,
|
|
||||||
Type: "bind",
|
|
||||||
Options: options,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ensure mount point on which path is mounted, is shared.
|
|
||||||
func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
|
|
||||||
mountInfo, err := lookupMount(path)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make sure source mount point is shared.
|
|
||||||
optsSplit := strings.Split(mountInfo.Optional, " ")
|
|
||||||
for _, opt := range optsSplit {
|
|
||||||
if strings.HasPrefix(opt, "shared:") {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ensure mount point on which path is mounted, is either shared or slave.
|
|
||||||
func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
|
|
||||||
mountInfo, err := lookupMount(path)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
// Make sure source mount point is shared.
|
|
||||||
optsSplit := strings.Split(mountInfo.Optional, " ")
|
|
||||||
for _, opt := range optsSplit {
|
|
||||||
if strings.HasPrefix(opt, "shared:") {
|
|
||||||
return nil
|
|
||||||
} else if strings.HasPrefix(opt, "master:") {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint)
|
|
||||||
}
|
|
||||||
|
|
||||||
// getDeviceUserGroupID() is used to find the right uid/gid
|
|
||||||
// value for the device node created in the container namespace.
|
|
||||||
// The runtime executes mknod() and chmod()s the created
|
|
||||||
// device with the values returned here.
|
|
||||||
//
|
|
||||||
// On Linux, uid and gid are sufficient and the user/groupname do not
|
|
||||||
// need to be resolved.
|
|
||||||
//
|
|
||||||
// TODO(mythi): In case of user namespaces, the runtime simply bind
|
|
||||||
// mounts the devices from the host. Additional logic is needed
|
|
||||||
// to check that the runtimes effective UID/GID on the host has the
|
|
||||||
// permissions to access the device node and/or the right user namespace
|
|
||||||
// mappings are created.
|
|
||||||
//
|
|
||||||
// Ref: https://github.com/kubernetes/kubernetes/issues/92211
|
|
||||||
func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
|
|
||||||
if runAsVal != nil {
|
|
||||||
return uint32(runAsVal.GetValue())
|
|
||||||
}
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithDevices sets the provided devices onto the container spec
|
|
||||||
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
|
|
||||||
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
|
||||||
if s.Linux == nil {
|
|
||||||
s.Linux = &runtimespec.Linux{}
|
|
||||||
}
|
|
||||||
if s.Linux.Resources == nil {
|
|
||||||
s.Linux.Resources = &runtimespec.LinuxResources{}
|
|
||||||
}
|
|
||||||
|
|
||||||
oldDevices := len(s.Linux.Devices)
|
|
||||||
|
|
||||||
for _, device := range config.GetDevices() {
|
|
||||||
path, err := osi.ResolveSymbolicLink(device.HostPath)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
|
|
||||||
if err := o(ctx, client, c, s); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if enableDeviceOwnershipFromSecurityContext {
|
|
||||||
UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser())
|
|
||||||
GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup())
|
|
||||||
// Loop all new devices added by oci.WithDevices() to update their
|
|
||||||
// dev.UID/dev.GID.
|
|
||||||
//
|
|
||||||
// non-zero UID/GID from SecurityContext is used to override host's
|
|
||||||
// device UID/GID for the container.
|
|
||||||
for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
|
|
||||||
if UID != 0 {
|
|
||||||
*s.Linux.Devices[idx].UID = UID
|
|
||||||
}
|
|
||||||
if GID != 0 {
|
|
||||||
*s.Linux.Devices[idx].GID = GID
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
var (
|
||||||
swapControllerAvailability bool
|
swapControllerAvailability bool
|
||||||
@ -312,88 +70,6 @@ func SwapControllerAvailable() bool {
|
|||||||
return swapControllerAvailability
|
return swapControllerAvailability
|
||||||
}
|
}
|
||||||
|
|
||||||
// WithResources sets the provided resource restrictions
|
|
||||||
func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
|
|
||||||
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
|
||||||
if resources == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
if s.Linux == nil {
|
|
||||||
s.Linux = &runtimespec.Linux{}
|
|
||||||
}
|
|
||||||
if s.Linux.Resources == nil {
|
|
||||||
s.Linux.Resources = &runtimespec.LinuxResources{}
|
|
||||||
}
|
|
||||||
if s.Linux.Resources.CPU == nil {
|
|
||||||
s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
|
|
||||||
}
|
|
||||||
if s.Linux.Resources.Memory == nil {
|
|
||||||
s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
|
|
||||||
}
|
|
||||||
var (
|
|
||||||
p = uint64(resources.GetCpuPeriod())
|
|
||||||
q = resources.GetCpuQuota()
|
|
||||||
shares = uint64(resources.GetCpuShares())
|
|
||||||
limit = resources.GetMemoryLimitInBytes()
|
|
||||||
swapLimit = resources.GetMemorySwapLimitInBytes()
|
|
||||||
hugepages = resources.GetHugepageLimits()
|
|
||||||
)
|
|
||||||
|
|
||||||
if p != 0 {
|
|
||||||
s.Linux.Resources.CPU.Period = &p
|
|
||||||
}
|
|
||||||
if q != 0 {
|
|
||||||
s.Linux.Resources.CPU.Quota = &q
|
|
||||||
}
|
|
||||||
if shares != 0 {
|
|
||||||
s.Linux.Resources.CPU.Shares = &shares
|
|
||||||
}
|
|
||||||
if cpus := resources.GetCpusetCpus(); cpus != "" {
|
|
||||||
s.Linux.Resources.CPU.Cpus = cpus
|
|
||||||
}
|
|
||||||
if mems := resources.GetCpusetMems(); mems != "" {
|
|
||||||
s.Linux.Resources.CPU.Mems = resources.GetCpusetMems()
|
|
||||||
}
|
|
||||||
if limit != 0 {
|
|
||||||
s.Linux.Resources.Memory.Limit = &limit
|
|
||||||
// swap/memory limit should be equal to prevent container from swapping by default
|
|
||||||
if swapLimit == 0 && SwapControllerAvailable() {
|
|
||||||
s.Linux.Resources.Memory.Swap = &limit
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if swapLimit != 0 {
|
|
||||||
s.Linux.Resources.Memory.Swap = &swapLimit
|
|
||||||
}
|
|
||||||
|
|
||||||
if !disableHugetlbController {
|
|
||||||
if isHugetlbControllerPresent() {
|
|
||||||
for _, limit := range hugepages {
|
|
||||||
s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
|
|
||||||
Pagesize: limit.PageSize,
|
|
||||||
Limit: limit.Limit,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if !tolerateMissingHugetlbController {
|
|
||||||
return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
|
|
||||||
"Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
|
|
||||||
}
|
|
||||||
logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if unified := resources.GetUnified(); unified != nil {
|
|
||||||
if s.Linux.Resources.Unified == nil {
|
|
||||||
s.Linux.Resources.Unified = make(map[string]string)
|
|
||||||
}
|
|
||||||
for k, v := range unified {
|
|
||||||
s.Linux.Resources.Unified[k] = v
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
var (
|
||||||
supportsHugetlbOnce sync.Once
|
supportsHugetlbOnce sync.Once
|
||||||
supportsHugetlb bool
|
supportsHugetlb bool
|
||||||
@ -463,72 +139,6 @@ func IsCgroup2UnifiedMode() bool {
|
|||||||
return isUnified
|
return isUnified
|
||||||
}
|
}
|
||||||
|
|
||||||
// WithOOMScoreAdj sets the oom score
|
|
||||||
func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
|
|
||||||
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
||||||
if s.Process == nil {
|
|
||||||
s.Process = &runtimespec.Process{}
|
|
||||||
}
|
|
||||||
|
|
||||||
resources := config.GetLinux().GetResources()
|
|
||||||
if resources == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
adj := int(resources.GetOomScoreAdj())
|
|
||||||
if restrict {
|
|
||||||
var err error
|
|
||||||
adj, err = restrictOOMScoreAdj(adj)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
s.Process.OOMScoreAdj = &adj
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithPodOOMScoreAdj sets the oom score for the pod sandbox
|
|
||||||
func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
|
|
||||||
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
||||||
if s.Process == nil {
|
|
||||||
s.Process = &runtimespec.Process{}
|
|
||||||
}
|
|
||||||
if restrict {
|
|
||||||
var err error
|
|
||||||
adj, err = restrictOOMScoreAdj(adj)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
s.Process.OOMScoreAdj = &adj
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// getCurrentOOMScoreAdj reads /proc/self/oom_score_adj, trims surrounding
// whitespace and parses the content as a decimal integer. Read and parse
// failures are both wrapped with the same message prefix.
func getCurrentOOMScoreAdj() (int, error) {
	raw, err := os.ReadFile("/proc/self/oom_score_adj")
	if err != nil {
		return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
	}

	score, err := strconv.Atoi(strings.TrimSpace(string(raw)))
	if err != nil {
		return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
	}
	return score, nil
}
|
|
||||||
|
|
||||||
func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
|
|
||||||
currentOOMScoreAdj, err := getCurrentOOMScoreAdj()
|
|
||||||
if err != nil {
|
|
||||||
return preferredOOMScoreAdj, err
|
|
||||||
}
|
|
||||||
if preferredOOMScoreAdj < currentOOMScoreAdj {
|
|
||||||
return currentOOMScoreAdj, nil
|
|
||||||
}
|
|
||||||
return preferredOOMScoreAdj, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// WithCDI updates OCI spec with CDI content
|
// WithCDI updates OCI spec with CDI content
|
||||||
func WithCDI(annotations map[string]string) oci.SpecOpts {
|
func WithCDI(annotations map[string]string) oci.SpecOpts {
|
||||||
return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error {
|
return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error {
|
||||||
|
426
pkg/cri/opts/spec_linux_opts.go
Normal file
426
pkg/cri/opts/spec_linux_opts.go
Normal file
@ -0,0 +1,426 @@
|
|||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package opts
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
|
||||||
|
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||||
|
"github.com/opencontainers/selinux/go-selinux/label"
|
||||||
|
"github.com/sirupsen/logrus"
|
||||||
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd/containers"
|
||||||
|
"github.com/containerd/containerd/log"
|
||||||
|
"github.com/containerd/containerd/mount"
|
||||||
|
"github.com/containerd/containerd/oci"
|
||||||
|
osinterface "github.com/containerd/containerd/pkg/os"
|
||||||
|
)
|
||||||
|
|
||||||
|
// WithMounts sorts and adds runtime and CRI mounts to the spec
|
||||||
|
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts {
|
||||||
|
return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
|
||||||
|
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
|
||||||
|
// is mounted by both a CRI mount and an extra mount, the CRI mount will
|
||||||
|
// be kept.
|
||||||
|
var (
|
||||||
|
criMounts = config.GetMounts()
|
||||||
|
mounts = append([]*runtime.Mount{}, criMounts...)
|
||||||
|
)
|
||||||
|
// Copy all mounts from extra mounts, except for mounts overridden by CRI.
|
||||||
|
for _, e := range extra {
|
||||||
|
found := false
|
||||||
|
for _, c := range criMounts {
|
||||||
|
if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
mounts = append(mounts, e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort mounts in number of parts. This ensures that high level mounts don't
|
||||||
|
// shadow other mounts.
|
||||||
|
sort.Sort(orderedMounts(mounts))
|
||||||
|
|
||||||
|
// Mount cgroup into the container as readonly, which inherits docker's behavior.
|
||||||
|
s.Mounts = append(s.Mounts, runtimespec.Mount{
|
||||||
|
Source: "cgroup",
|
||||||
|
Destination: "/sys/fs/cgroup",
|
||||||
|
Type: "cgroup",
|
||||||
|
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
|
||||||
|
})
|
||||||
|
|
||||||
|
// Copy all mounts from default mounts, except for
|
||||||
|
// - mounts overridden by supplied mount;
|
||||||
|
// - all mounts under /dev if a supplied /dev is present.
|
||||||
|
mountSet := make(map[string]struct{})
|
||||||
|
for _, m := range mounts {
|
||||||
|
mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
defaultMounts := s.Mounts
|
||||||
|
s.Mounts = nil
|
||||||
|
|
||||||
|
for _, m := range defaultMounts {
|
||||||
|
dst := filepath.Clean(m.Destination)
|
||||||
|
if _, ok := mountSet[dst]; ok {
|
||||||
|
// filter out mount overridden by a supplied mount
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
|
||||||
|
// filter out everything under /dev if /dev is a supplied mount
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.Mounts = append(s.Mounts, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, mount := range mounts {
|
||||||
|
var (
|
||||||
|
dst = mount.GetContainerPath()
|
||||||
|
src = mount.GetHostPath()
|
||||||
|
)
|
||||||
|
// Create the host path if it doesn't exist.
|
||||||
|
// TODO(random-liu): Add CRI validation test for this case.
|
||||||
|
if _, err := osi.Stat(src); err != nil {
|
||||||
|
if !os.IsNotExist(err) {
|
||||||
|
return fmt.Errorf("failed to stat %q: %w", src, err)
|
||||||
|
}
|
||||||
|
if err := osi.MkdirAll(src, 0755); err != nil {
|
||||||
|
return fmt.Errorf("failed to mkdir %q: %w", src, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO(random-liu): Add cri-containerd integration test or cri validation test
|
||||||
|
// for this.
|
||||||
|
src, err := osi.ResolveSymbolicLink(src)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
|
||||||
|
}
|
||||||
|
if s.Linux == nil {
|
||||||
|
s.Linux = &runtimespec.Linux{}
|
||||||
|
}
|
||||||
|
options := []string{"rbind"}
|
||||||
|
switch mount.GetPropagation() {
|
||||||
|
case runtime.MountPropagation_PROPAGATION_PRIVATE:
|
||||||
|
options = append(options, "rprivate")
|
||||||
|
// Since default root propagation in runc is rprivate ignore
|
||||||
|
// setting the root propagation
|
||||||
|
case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
|
||||||
|
if err := ensureShared(src, osi.LookupMount); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
options = append(options, "rshared")
|
||||||
|
s.Linux.RootfsPropagation = "rshared"
|
||||||
|
case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
|
||||||
|
if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
options = append(options, "rslave")
|
||||||
|
if s.Linux.RootfsPropagation != "rshared" &&
|
||||||
|
s.Linux.RootfsPropagation != "rslave" {
|
||||||
|
s.Linux.RootfsPropagation = "rslave"
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
|
||||||
|
options = append(options, "rprivate")
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
|
||||||
|
// is readonly. This is different from docker's behavior, but make more sense.
|
||||||
|
if mount.GetReadonly() {
|
||||||
|
options = append(options, "ro")
|
||||||
|
} else {
|
||||||
|
options = append(options, "rw")
|
||||||
|
}
|
||||||
|
|
||||||
|
if mount.GetSelinuxRelabel() {
|
||||||
|
ENOTSUP := syscall.Errno(0x5f) // Linux specific error code, this branch will not execute on non Linux platforms.
|
||||||
|
if err := label.Relabel(src, mountLabel, false); err != nil && err != ENOTSUP {
|
||||||
|
return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.Mounts = append(s.Mounts, runtimespec.Mount{
|
||||||
|
Source: src,
|
||||||
|
Destination: dst,
|
||||||
|
Type: "bind",
|
||||||
|
Options: options,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure mount point on which path is mounted, is shared.
|
||||||
|
func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
|
||||||
|
mountInfo, err := lookupMount(path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure source mount point is shared.
|
||||||
|
optsSplit := strings.Split(mountInfo.Optional, " ")
|
||||||
|
for _, opt := range optsSplit {
|
||||||
|
if strings.HasPrefix(opt, "shared:") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensure mount point on which path is mounted, is either shared or slave.
|
||||||
|
func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
|
||||||
|
mountInfo, err := lookupMount(path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Make sure source mount point is shared.
|
||||||
|
optsSplit := strings.Split(mountInfo.Optional, " ")
|
||||||
|
for _, opt := range optsSplit {
|
||||||
|
if strings.HasPrefix(opt, "shared:") {
|
||||||
|
return nil
|
||||||
|
} else if strings.HasPrefix(opt, "master:") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint)
|
||||||
|
}
|
||||||
|
|
||||||
|
// getDeviceUserGroupID() is used to find the right uid/gid
|
||||||
|
// value for the device node created in the container namespace.
|
||||||
|
// The runtime executes mknod() and chmod()s the created
|
||||||
|
// device with the values returned here.
|
||||||
|
//
|
||||||
|
// On Linux, uid and gid are sufficient and the user/groupname do not
|
||||||
|
// need to be resolved.
|
||||||
|
//
|
||||||
|
// TODO(mythi): In case of user namespaces, the runtime simply bind
|
||||||
|
// mounts the devices from the host. Additional logic is needed
|
||||||
|
// to check that the runtimes effective UID/GID on the host has the
|
||||||
|
// permissions to access the device node and/or the right user namespace
|
||||||
|
// mappings are created.
|
||||||
|
//
|
||||||
|
// Ref: https://github.com/kubernetes/kubernetes/issues/92211
|
||||||
|
func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
|
||||||
|
if runAsVal != nil {
|
||||||
|
return uint32(runAsVal.GetValue())
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithDevices sets the provided devices onto the container spec
|
||||||
|
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
|
||||||
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
||||||
|
if s.Linux == nil {
|
||||||
|
s.Linux = &runtimespec.Linux{}
|
||||||
|
}
|
||||||
|
if s.Linux.Resources == nil {
|
||||||
|
s.Linux.Resources = &runtimespec.LinuxResources{}
|
||||||
|
}
|
||||||
|
|
||||||
|
oldDevices := len(s.Linux.Devices)
|
||||||
|
|
||||||
|
for _, device := range config.GetDevices() {
|
||||||
|
path, err := osi.ResolveSymbolicLink(device.HostPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
|
||||||
|
if err := o(ctx, client, c, s); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if enableDeviceOwnershipFromSecurityContext {
|
||||||
|
UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser())
|
||||||
|
GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup())
|
||||||
|
// Loop all new devices added by oci.WithDevices() to update their
|
||||||
|
// dev.UID/dev.GID.
|
||||||
|
//
|
||||||
|
// non-zero UID/GID from SecurityContext is used to override host's
|
||||||
|
// device UID/GID for the container.
|
||||||
|
for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
|
||||||
|
if UID != 0 {
|
||||||
|
*s.Linux.Devices[idx].UID = UID
|
||||||
|
}
|
||||||
|
if GID != 0 {
|
||||||
|
*s.Linux.Devices[idx].GID = GID
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithResources sets the provided resource restrictions
|
||||||
|
func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
|
||||||
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
||||||
|
if resources == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if s.Linux == nil {
|
||||||
|
s.Linux = &runtimespec.Linux{}
|
||||||
|
}
|
||||||
|
if s.Linux.Resources == nil {
|
||||||
|
s.Linux.Resources = &runtimespec.LinuxResources{}
|
||||||
|
}
|
||||||
|
if s.Linux.Resources.CPU == nil {
|
||||||
|
s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
|
||||||
|
}
|
||||||
|
if s.Linux.Resources.Memory == nil {
|
||||||
|
s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
|
||||||
|
}
|
||||||
|
var (
|
||||||
|
p = uint64(resources.GetCpuPeriod())
|
||||||
|
q = resources.GetCpuQuota()
|
||||||
|
shares = uint64(resources.GetCpuShares())
|
||||||
|
limit = resources.GetMemoryLimitInBytes()
|
||||||
|
swapLimit = resources.GetMemorySwapLimitInBytes()
|
||||||
|
hugepages = resources.GetHugepageLimits()
|
||||||
|
)
|
||||||
|
|
||||||
|
if p != 0 {
|
||||||
|
s.Linux.Resources.CPU.Period = &p
|
||||||
|
}
|
||||||
|
if q != 0 {
|
||||||
|
s.Linux.Resources.CPU.Quota = &q
|
||||||
|
}
|
||||||
|
if shares != 0 {
|
||||||
|
s.Linux.Resources.CPU.Shares = &shares
|
||||||
|
}
|
||||||
|
if cpus := resources.GetCpusetCpus(); cpus != "" {
|
||||||
|
s.Linux.Resources.CPU.Cpus = cpus
|
||||||
|
}
|
||||||
|
if mems := resources.GetCpusetMems(); mems != "" {
|
||||||
|
s.Linux.Resources.CPU.Mems = resources.GetCpusetMems()
|
||||||
|
}
|
||||||
|
if limit != 0 {
|
||||||
|
s.Linux.Resources.Memory.Limit = &limit
|
||||||
|
// swap/memory limit should be equal to prevent container from swapping by default
|
||||||
|
if swapLimit == 0 && SwapControllerAvailable() {
|
||||||
|
s.Linux.Resources.Memory.Swap = &limit
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if swapLimit != 0 {
|
||||||
|
s.Linux.Resources.Memory.Swap = &swapLimit
|
||||||
|
}
|
||||||
|
|
||||||
|
if !disableHugetlbController {
|
||||||
|
if isHugetlbControllerPresent() {
|
||||||
|
for _, limit := range hugepages {
|
||||||
|
s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
|
||||||
|
Pagesize: limit.PageSize,
|
||||||
|
Limit: limit.Limit,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if !tolerateMissingHugetlbController {
|
||||||
|
return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
|
||||||
|
"Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
|
||||||
|
}
|
||||||
|
logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if unified := resources.GetUnified(); unified != nil {
|
||||||
|
if s.Linux.Resources.Unified == nil {
|
||||||
|
s.Linux.Resources.Unified = make(map[string]string)
|
||||||
|
}
|
||||||
|
for k, v := range unified {
|
||||||
|
s.Linux.Resources.Unified[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithOOMScoreAdj sets the oom score
|
||||||
|
func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
|
||||||
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
||||||
|
if s.Process == nil {
|
||||||
|
s.Process = &runtimespec.Process{}
|
||||||
|
}
|
||||||
|
|
||||||
|
resources := config.GetLinux().GetResources()
|
||||||
|
if resources == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
adj := int(resources.GetOomScoreAdj())
|
||||||
|
if restrict {
|
||||||
|
var err error
|
||||||
|
adj, err = restrictOOMScoreAdj(adj)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.Process.OOMScoreAdj = &adj
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithPodOOMScoreAdj sets the oom score for the pod sandbox
|
||||||
|
func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
|
||||||
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
||||||
|
if s.Process == nil {
|
||||||
|
s.Process = &runtimespec.Process{}
|
||||||
|
}
|
||||||
|
if restrict {
|
||||||
|
var err error
|
||||||
|
adj, err = restrictOOMScoreAdj(adj)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.Process.OOMScoreAdj = &adj
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// getCurrentOOMScoreAdj reads /proc/self/oom_score_adj and returns its
// contents as an integer. Both a read failure and a parse failure are
// reported as a wrapped "could not get the daemon oom_score_adj" error.
func getCurrentOOMScoreAdj() (int, error) {
	raw, err := os.ReadFile("/proc/self/oom_score_adj")
	if err != nil {
		return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
	}

	// The file content may carry surrounding whitespace; trim before parsing.
	score, err := strconv.Atoi(strings.TrimSpace(string(raw)))
	if err != nil {
		return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
	}
	return score, nil
}
|
||||||
|
|
||||||
|
func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
|
||||||
|
currentOOMScoreAdj, err := getCurrentOOMScoreAdj()
|
||||||
|
if err != nil {
|
||||||
|
return preferredOOMScoreAdj, err
|
||||||
|
}
|
||||||
|
if preferredOOMScoreAdj < currentOOMScoreAdj {
|
||||||
|
return currentOOMScoreAdj, nil
|
||||||
|
}
|
||||||
|
return preferredOOMScoreAdj, nil
|
||||||
|
}
|
41
pkg/cri/opts/spec_nonlinux.go
Normal file
41
pkg/cri/opts/spec_nonlinux.go
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
//go:build !linux
|
||||||
|
|
||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package opts
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd/containers"
|
||||||
|
"github.com/containerd/containerd/oci"
|
||||||
|
)
|
||||||
|
|
||||||
|
// isHugetlbControllerPresent always reports false in this non-Linux build.
func isHugetlbControllerPresent() bool {
	return false
}
|
||||||
|
|
||||||
|
// SwapControllerAvailable always reports false in this non-Linux build.
func SwapControllerAvailable() bool {
	return false
}
|
||||||
|
|
||||||
|
// WithCDI does nothing on non Linux platforms.
|
||||||
|
func WithCDI(_ map[string]string) oci.SpecOpts {
|
||||||
|
return func(ctx context.Context, client oci.Client, container *containers.Container, spec *oci.Spec) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
@ -24,11 +24,11 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/containerd/containerd/containers"
|
|
||||||
"github.com/containerd/containerd/oci"
|
|
||||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd/containers"
|
||||||
|
"github.com/containerd/containerd/oci"
|
||||||
osinterface "github.com/containerd/containerd/pkg/os"
|
osinterface "github.com/containerd/containerd/pkg/os"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -229,8 +229,8 @@ func WithWindowsCredentialSpec(credentialSpec string) oci.SpecOpts {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// WithDevices sets the provided devices onto the container spec
|
// WithWindowsDevices sets the provided devices onto the container spec
|
||||||
func WithDevices(config *runtime.ContainerConfig) oci.SpecOpts {
|
func WithWindowsDevices(config *runtime.ContainerConfig) oci.SpecOpts {
|
||||||
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
||||||
for _, device := range config.GetDevices() {
|
for _, device := range config.GetDevices() {
|
||||||
if device.ContainerPath != "" {
|
if device.ContainerPath != "" {
|
@ -22,14 +22,15 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/containerd/containerd/containers"
|
|
||||||
"github.com/containerd/containerd/namespaces"
|
|
||||||
"github.com/containerd/containerd/oci"
|
|
||||||
osinterface "github.com/containerd/containerd/pkg/os"
|
|
||||||
"github.com/opencontainers/runtime-spec/specs-go"
|
"github.com/opencontainers/runtime-spec/specs-go"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd/containers"
|
||||||
|
"github.com/containerd/containerd/namespaces"
|
||||||
|
"github.com/containerd/containerd/oci"
|
||||||
|
osinterface "github.com/containerd/containerd/pkg/os"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestWithDevices(t *testing.T) {
|
func TestWithDevices(t *testing.T) {
|
||||||
@ -183,7 +184,7 @@ func TestWithDevices(t *testing.T) {
|
|||||||
config := runtime.ContainerConfig{}
|
config := runtime.ContainerConfig{}
|
||||||
config.Devices = tc.devices
|
config.Devices = tc.devices
|
||||||
|
|
||||||
specOpts := []oci.SpecOpts{WithDevices(&config)}
|
specOpts := []oci.SpecOpts{WithWindowsDevices(&config)}
|
||||||
|
|
||||||
platform := "windows"
|
platform := "windows"
|
||||||
if tc.isLCOW {
|
if tc.isLCOW {
|
||||||
|
@ -24,6 +24,14 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/containerd/typeurl"
|
||||||
|
"github.com/davecgh/go-spew/spew"
|
||||||
|
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||||
|
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||||
|
"github.com/opencontainers/selinux/go-selinux"
|
||||||
|
"github.com/opencontainers/selinux/go-selinux/label"
|
||||||
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
"github.com/containerd/containerd"
|
"github.com/containerd/containerd"
|
||||||
"github.com/containerd/containerd/api/types"
|
"github.com/containerd/containerd/api/types"
|
||||||
"github.com/containerd/containerd/containers"
|
"github.com/containerd/containerd/containers"
|
||||||
@ -37,12 +45,6 @@ import (
|
|||||||
containerstore "github.com/containerd/containerd/pkg/cri/store/container"
|
containerstore "github.com/containerd/containerd/pkg/cri/store/container"
|
||||||
"github.com/containerd/containerd/pkg/cri/util"
|
"github.com/containerd/containerd/pkg/cri/util"
|
||||||
ctrdutil "github.com/containerd/containerd/pkg/cri/util"
|
ctrdutil "github.com/containerd/containerd/pkg/cri/util"
|
||||||
"github.com/containerd/typeurl"
|
|
||||||
"github.com/davecgh/go-spew/spew"
|
|
||||||
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
|
||||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
|
||||||
"github.com/opencontainers/selinux/go-selinux"
|
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
@ -419,18 +421,75 @@ func (c *criService) buildContainerSpec(
|
|||||||
ociRuntime config.Runtime,
|
ociRuntime config.Runtime,
|
||||||
) (_ *runtimespec.Spec, retErr error) {
|
) (_ *runtimespec.Spec, retErr error) {
|
||||||
var (
|
var (
|
||||||
specOpts []oci.SpecOpts
|
|
||||||
|
|
||||||
// Platform helpers
|
|
||||||
isLinux = platform.OS == "linux"
|
isLinux = platform.OS == "linux"
|
||||||
isWindows = platform.OS == "windows"
|
isWindows = platform.OS == "windows"
|
||||||
|
isDarwin = platform.OS == "darwin"
|
||||||
)
|
)
|
||||||
|
|
||||||
if isLinux {
|
switch {
|
||||||
specOpts = append(specOpts, oci.WithoutRunMount)
|
case isLinux:
|
||||||
|
return c.buildLinuxSpec(
|
||||||
|
id,
|
||||||
|
sandboxID,
|
||||||
|
sandboxPid,
|
||||||
|
netNSPath,
|
||||||
|
containerName,
|
||||||
|
imageName,
|
||||||
|
config,
|
||||||
|
sandboxConfig,
|
||||||
|
imageConfig,
|
||||||
|
extraMounts,
|
||||||
|
ociRuntime,
|
||||||
|
)
|
||||||
|
case isWindows:
|
||||||
|
return c.buildWindowsSpec(
|
||||||
|
id,
|
||||||
|
sandboxID,
|
||||||
|
sandboxPid,
|
||||||
|
netNSPath,
|
||||||
|
containerName,
|
||||||
|
imageName,
|
||||||
|
config,
|
||||||
|
sandboxConfig,
|
||||||
|
imageConfig,
|
||||||
|
extraMounts,
|
||||||
|
ociRuntime,
|
||||||
|
)
|
||||||
|
case isDarwin:
|
||||||
|
return c.buildDarwinSpec(
|
||||||
|
id,
|
||||||
|
sandboxID,
|
||||||
|
containerName,
|
||||||
|
imageName,
|
||||||
|
config,
|
||||||
|
sandboxConfig,
|
||||||
|
imageConfig,
|
||||||
|
extraMounts,
|
||||||
|
ociRuntime,
|
||||||
|
)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported spec platform: %s", platform.OS)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Only clear the default security settings if the runtime does not have a custom
|
func (c *criService) buildLinuxSpec(
|
||||||
// base runtime spec. Admins can use this functionality to define
|
id string,
|
||||||
|
sandboxID string,
|
||||||
|
sandboxPid uint32,
|
||||||
|
netNSPath string,
|
||||||
|
containerName string,
|
||||||
|
imageName string,
|
||||||
|
config *runtime.ContainerConfig,
|
||||||
|
sandboxConfig *runtime.PodSandboxConfig,
|
||||||
|
imageConfig *imagespec.ImageConfig,
|
||||||
|
extraMounts []*runtime.Mount,
|
||||||
|
ociRuntime config.Runtime,
|
||||||
|
) (_ *runtimespec.Spec, retErr error) {
|
||||||
|
specOpts := []oci.SpecOpts{
|
||||||
|
oci.WithoutRunMount,
|
||||||
|
}
|
||||||
|
// only clear the default security settings if the runtime does not have a custom
|
||||||
|
// base runtime spec. Admins can use this functionality to define
|
||||||
// default ulimits, seccomp, or other default settings.
|
// default ulimits, seccomp, or other default settings.
|
||||||
if ociRuntime.BaseRuntimeSpec == "" {
|
if ociRuntime.BaseRuntimeSpec == "" {
|
||||||
specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings)
|
specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings)
|
||||||
@ -438,11 +497,22 @@ func (c *criService) buildContainerSpec(
|
|||||||
|
|
||||||
specOpts = append(specOpts,
|
specOpts = append(specOpts,
|
||||||
customopts.WithRelativeRoot(relativeRootfsPath),
|
customopts.WithRelativeRoot(relativeRootfsPath),
|
||||||
|
customopts.WithProcessArgs(config, imageConfig),
|
||||||
oci.WithDefaultPathEnv,
|
oci.WithDefaultPathEnv,
|
||||||
// this will be set based on the security context below
|
// this will be set based on the security context below
|
||||||
oci.WithNewPrivileges,
|
oci.WithNewPrivileges,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if config.GetWorkingDir() != "" {
|
||||||
|
specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
|
||||||
|
} else if imageConfig.WorkingDir != "" {
|
||||||
|
specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.GetTty() {
|
||||||
|
specOpts = append(specOpts, oci.WithTTY)
|
||||||
|
}
|
||||||
|
|
||||||
// Add HOSTNAME env.
|
// Add HOSTNAME env.
|
||||||
var (
|
var (
|
||||||
err error
|
err error
|
||||||
@ -454,19 +524,6 @@ func (c *criService) buildContainerSpec(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname}))
|
specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname}))
|
||||||
}
|
|
||||||
|
|
||||||
specOpts = append(specOpts, customopts.WithProcessArgs(config, imageConfig))
|
|
||||||
|
|
||||||
if config.GetWorkingDir() != "" {
|
|
||||||
specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
|
|
||||||
} else if imageConfig.WorkingDir != "" {
|
|
||||||
specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
|
|
||||||
}
|
|
||||||
|
|
||||||
if config.GetTty() {
|
|
||||||
specOpts = append(specOpts, oci.WithTTY)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Apply envs from image config first, so that envs from container config
|
// Apply envs from image config first, so that envs from container config
|
||||||
// can override them.
|
// can override them.
|
||||||
@ -476,44 +533,32 @@ func (c *criService) buildContainerSpec(
|
|||||||
}
|
}
|
||||||
specOpts = append(specOpts, oci.WithEnv(env))
|
specOpts = append(specOpts, oci.WithEnv(env))
|
||||||
|
|
||||||
if isWindows {
|
securityContext := config.GetLinux().GetSecurityContext()
|
||||||
specOpts = append(specOpts,
|
labelOptions, err := toLabel(securityContext.GetSelinuxOptions())
|
||||||
// Clear the root location since hcsshim expects it.
|
|
||||||
// NOTE: readonly rootfs doesn't work on windows.
|
|
||||||
customopts.WithoutRoot,
|
|
||||||
oci.WithWindowsNetworkNamespace(netNSPath),
|
|
||||||
oci.WithHostname(sandboxConfig.GetHostname()),
|
|
||||||
)
|
|
||||||
|
|
||||||
// All containers in a pod need to have HostProcess set if it was set on the pod,
|
|
||||||
// and vice versa no containers in the pod can be HostProcess if the pods spec
|
|
||||||
// didn't have the field set. The only case that is valid is if these are the same value.
|
|
||||||
cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess()
|
|
||||||
sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess()
|
|
||||||
if cntrHpc != sandboxHpc {
|
|
||||||
return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid")
|
|
||||||
}
|
|
||||||
|
|
||||||
specOpts = append(specOpts, customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc)))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get spec opts that depend on features offered by the platform containerd daemon is running on.
|
|
||||||
platformSpecOpts, err := c.platformSpec(
|
|
||||||
id,
|
|
||||||
sandboxID,
|
|
||||||
config,
|
|
||||||
sandboxConfig,
|
|
||||||
imageConfig,
|
|
||||||
extraMounts,
|
|
||||||
)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
if len(labelOptions) == 0 {
|
||||||
|
// Use pod level SELinux config
|
||||||
|
if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil {
|
||||||
|
labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
specOpts = append(specOpts, platformSpecOpts...)
|
processLabel, mountLabel, err := label.InitLabels(labelOptions)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if retErr != nil {
|
||||||
|
selinux.ReleaseLabel(processLabel)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
if isLinux {
|
specOpts = append(specOpts, customopts.WithMounts(c.os, config, extraMounts, mountLabel))
|
||||||
securityContext := config.GetLinux().GetSecurityContext()
|
|
||||||
|
|
||||||
if !c.config.DisableProcMount {
|
if !c.config.DisableProcMount {
|
||||||
// Change the default masked/readonly paths to empty slices
|
// Change the default masked/readonly paths to empty slices
|
||||||
@ -534,6 +579,9 @@ func (c *criService) buildContainerSpec(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
|
||||||
|
customopts.WithCapabilities(securityContext, c.allCaps))
|
||||||
|
|
||||||
if securityContext.GetPrivileged() {
|
if securityContext.GetPrivileged() {
|
||||||
if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() {
|
if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() {
|
||||||
return nil, errors.New("no privileged container allowed in sandbox")
|
return nil, errors.New("no privileged container allowed in sandbox")
|
||||||
@ -551,7 +599,10 @@ func (c *criService) buildContainerSpec(
|
|||||||
// is not clearly defined in Kubernetes.
|
// is not clearly defined in Kubernetes.
|
||||||
// See https://github.com/kubernetes/kubernetes/issues/56374
|
// See https://github.com/kubernetes/kubernetes/issues/56374
|
||||||
// Keep docker's behavior for now.
|
// Keep docker's behavior for now.
|
||||||
specOpts = append(specOpts, customopts.WithoutAmbientCaps)
|
specOpts = append(specOpts,
|
||||||
|
customopts.WithoutAmbientCaps,
|
||||||
|
customopts.WithSelinuxLabels(processLabel, mountLabel),
|
||||||
|
)
|
||||||
|
|
||||||
// TODO: Figure out whether we should set no new privilege for sandbox container by default
|
// TODO: Figure out whether we should set no new privilege for sandbox container by default
|
||||||
if securityContext.GetNoNewPrivs() {
|
if securityContext.GetNoNewPrivs() {
|
||||||
@ -562,8 +613,49 @@ func (c *criService) buildContainerSpec(
|
|||||||
specOpts = append(specOpts, oci.WithRootFSReadonly())
|
specOpts = append(specOpts, oci.WithRootFSReadonly())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if c.config.DisableCgroup {
|
||||||
|
specOpts = append(specOpts, customopts.WithDisabledCgroups)
|
||||||
|
} else {
|
||||||
|
specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController))
|
||||||
|
if sandboxConfig.GetLinux().GetCgroupParent() != "" {
|
||||||
|
cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id)
|
||||||
|
specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
supplementalGroups := securityContext.GetSupplementalGroups()
|
supplementalGroups := securityContext.GetSupplementalGroups()
|
||||||
specOpts = append(specOpts, customopts.WithSupplementalGroups(supplementalGroups))
|
|
||||||
|
// Get blockio class
|
||||||
|
blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to set blockio class: %w", err)
|
||||||
|
}
|
||||||
|
if blockIOClass != "" {
|
||||||
|
if linuxBlockIO, err := blockIOToLinuxOci(blockIOClass); err == nil {
|
||||||
|
specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO))
|
||||||
|
} else {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get RDT class
|
||||||
|
rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to set RDT class: %w", err)
|
||||||
|
}
|
||||||
|
if rdtClass != "" {
|
||||||
|
specOpts = append(specOpts, oci.WithRdt(rdtClass, "", ""))
|
||||||
|
}
|
||||||
|
|
||||||
|
for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
|
||||||
|
ociRuntime.PodAnnotations) {
|
||||||
|
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
|
||||||
|
}
|
||||||
|
|
||||||
|
for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
|
||||||
|
ociRuntime.ContainerAnnotations) {
|
||||||
|
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
|
||||||
|
}
|
||||||
|
|
||||||
// Default target PID namespace is the sandbox PID.
|
// Default target PID namespace is the sandbox PID.
|
||||||
targetPid := sandboxPid
|
targetPid := sandboxPid
|
||||||
@ -580,12 +672,177 @@ func (c *criService) buildContainerSpec(
|
|||||||
targetPid = status.Pid
|
targetPid = status.Pid
|
||||||
}
|
}
|
||||||
|
|
||||||
specOpts = append(specOpts,
|
uids, gids, err := parseUsernsIDs(nsOpts.GetUsernsOptions())
|
||||||
// TODO: This is a hack to make this compile. We should move userns support to sbserver.
|
if err != nil {
|
||||||
customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, nil, nil),
|
return nil, fmt.Errorf("user namespace configuration: %w", err)
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check sandbox userns config is consistent with container config.
|
||||||
|
sandboxUsernsOpts := sandboxConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
|
||||||
|
if !sameUsernsConfig(sandboxUsernsOpts, nsOpts.GetUsernsOptions()) {
|
||||||
|
return nil, fmt.Errorf("user namespace config for sandbox is different from container. Sandbox userns config: %v - Container userns config: %v", sandboxUsernsOpts, nsOpts.GetUsernsOptions())
|
||||||
|
}
|
||||||
|
|
||||||
|
specOpts = append(specOpts,
|
||||||
|
customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj),
|
||||||
|
customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, uids, gids),
|
||||||
|
customopts.WithSupplementalGroups(supplementalGroups),
|
||||||
|
customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer),
|
||||||
|
customopts.WithAnnotation(annotations.SandboxID, sandboxID),
|
||||||
|
customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()),
|
||||||
|
customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()),
|
||||||
|
customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()),
|
||||||
|
customopts.WithAnnotation(annotations.ContainerName, containerName),
|
||||||
|
customopts.WithAnnotation(annotations.ImageName, imageName),
|
||||||
|
)
|
||||||
|
|
||||||
|
// cgroupns is used for hiding /sys/fs/cgroup from containers.
|
||||||
|
// For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged.
|
||||||
|
// https://github.com/containers/libpod/issues/4363
|
||||||
|
// https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace
|
||||||
|
if isUnifiedCgroupsMode() && !securityContext.GetPrivileged() {
|
||||||
|
specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace}))
|
||||||
|
}
|
||||||
|
|
||||||
|
return c.runtimeSpec(id, ociRuntime.BaseRuntimeSpec, specOpts...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *criService) buildWindowsSpec(
|
||||||
|
id string,
|
||||||
|
sandboxID string,
|
||||||
|
sandboxPid uint32,
|
||||||
|
netNSPath string,
|
||||||
|
containerName string,
|
||||||
|
imageName string,
|
||||||
|
config *runtime.ContainerConfig,
|
||||||
|
sandboxConfig *runtime.PodSandboxConfig,
|
||||||
|
imageConfig *imagespec.ImageConfig,
|
||||||
|
extraMounts []*runtime.Mount,
|
||||||
|
ociRuntime config.Runtime,
|
||||||
|
) (_ *runtimespec.Spec, retErr error) {
|
||||||
|
specOpts := []oci.SpecOpts{
|
||||||
|
customopts.WithProcessArgs(config, imageConfig),
|
||||||
|
}
|
||||||
|
|
||||||
|
// All containers in a pod need to have HostProcess set if it was set on the pod,
|
||||||
|
// and vice versa no containers in the pod can be HostProcess if the pods spec
|
||||||
|
// didn't have the field set. The only case that is valid is if these are the same value.
|
||||||
|
cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess()
|
||||||
|
sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess()
|
||||||
|
if cntrHpc != sandboxHpc {
|
||||||
|
return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid")
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.GetWorkingDir() != "" {
|
||||||
|
specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
|
||||||
|
} else if imageConfig.WorkingDir != "" {
|
||||||
|
specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.GetTty() {
|
||||||
|
specOpts = append(specOpts, oci.WithTTY)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply envs from image config first, so that envs from container config
|
||||||
|
// can override them.
|
||||||
|
env := append([]string{}, imageConfig.Env...)
|
||||||
|
for _, e := range config.GetEnvs() {
|
||||||
|
env = append(env, e.GetKey()+"="+e.GetValue())
|
||||||
|
}
|
||||||
|
specOpts = append(specOpts, oci.WithEnv(env))
|
||||||
|
|
||||||
|
specOpts = append(specOpts,
|
||||||
|
// Clear the root location since hcsshim expects it.
|
||||||
|
// NOTE: readonly rootfs doesn't work on windows.
|
||||||
|
customopts.WithoutRoot,
|
||||||
|
oci.WithWindowsNetworkNamespace(netNSPath),
|
||||||
|
oci.WithHostname(sandboxConfig.GetHostname()),
|
||||||
|
)
|
||||||
|
|
||||||
|
specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithWindowsDevices(config))
|
||||||
|
|
||||||
|
// Start with the image config user and override below if RunAsUsername is not "".
|
||||||
|
username := imageConfig.User
|
||||||
|
|
||||||
|
windowsConfig := config.GetWindows()
|
||||||
|
if windowsConfig != nil {
|
||||||
|
specOpts = append(specOpts, customopts.WithWindowsResources(windowsConfig.GetResources()))
|
||||||
|
securityCtx := windowsConfig.GetSecurityContext()
|
||||||
|
if securityCtx != nil {
|
||||||
|
runAsUser := securityCtx.GetRunAsUsername()
|
||||||
|
if runAsUser != "" {
|
||||||
|
username = runAsUser
|
||||||
|
}
|
||||||
|
cs := securityCtx.GetCredentialSpec()
|
||||||
|
if cs != "" {
|
||||||
|
specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// There really isn't a good Windows way to verify that the username is available in the
|
||||||
|
// image as early as here like there is for Linux. Later on in the stack hcsshim
|
||||||
|
// will handle the behavior of erroring out if the user isn't available in the image
|
||||||
|
// when trying to run the init process.
|
||||||
|
specOpts = append(specOpts, oci.WithUser(username))
|
||||||
|
|
||||||
|
for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
|
||||||
|
ociRuntime.PodAnnotations) {
|
||||||
|
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
|
||||||
|
}
|
||||||
|
|
||||||
|
for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
|
||||||
|
ociRuntime.ContainerAnnotations) {
|
||||||
|
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
|
||||||
|
}
|
||||||
|
|
||||||
|
specOpts = append(specOpts,
|
||||||
|
customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer),
|
||||||
|
customopts.WithAnnotation(annotations.SandboxID, sandboxID),
|
||||||
|
customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()),
|
||||||
|
customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()),
|
||||||
|
customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()),
|
||||||
|
customopts.WithAnnotation(annotations.ContainerName, containerName),
|
||||||
|
customopts.WithAnnotation(annotations.ImageName, imageName),
|
||||||
|
customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc)),
|
||||||
|
)
|
||||||
|
|
||||||
|
return c.runtimeSpec(id, ociRuntime.BaseRuntimeSpec, specOpts...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *criService) buildDarwinSpec(
|
||||||
|
id string,
|
||||||
|
sandboxID string,
|
||||||
|
containerName string,
|
||||||
|
imageName string,
|
||||||
|
config *runtime.ContainerConfig,
|
||||||
|
sandboxConfig *runtime.PodSandboxConfig,
|
||||||
|
imageConfig *imagespec.ImageConfig,
|
||||||
|
extraMounts []*runtime.Mount,
|
||||||
|
ociRuntime config.Runtime,
|
||||||
|
) (_ *runtimespec.Spec, retErr error) {
|
||||||
|
specOpts := []oci.SpecOpts{
|
||||||
|
customopts.WithProcessArgs(config, imageConfig),
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.GetWorkingDir() != "" {
|
||||||
|
specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
|
||||||
|
} else if imageConfig.WorkingDir != "" {
|
||||||
|
specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.GetTty() {
|
||||||
|
specOpts = append(specOpts, oci.WithTTY)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply envs from image config first, so that envs from container config
|
||||||
|
// can override them.
|
||||||
|
env := append([]string{}, imageConfig.Env...)
|
||||||
|
for _, e := range config.GetEnvs() {
|
||||||
|
env = append(env, e.GetKey()+"="+e.GetValue())
|
||||||
|
}
|
||||||
|
specOpts = append(specOpts, oci.WithEnv(env))
|
||||||
|
|
||||||
for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
|
for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
|
||||||
ociRuntime.PodAnnotations) {
|
ociRuntime.PodAnnotations) {
|
||||||
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
|
specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
|
||||||
|
@ -25,16 +25,13 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/containerd/cgroups/v3"
|
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||||
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
"github.com/containerd/containerd/contrib/apparmor"
|
"github.com/containerd/containerd/contrib/apparmor"
|
||||||
"github.com/containerd/containerd/contrib/seccomp"
|
"github.com/containerd/containerd/contrib/seccomp"
|
||||||
"github.com/containerd/containerd/oci"
|
"github.com/containerd/containerd/oci"
|
||||||
"github.com/containerd/containerd/snapshots"
|
"github.com/containerd/containerd/snapshots"
|
||||||
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
|
||||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
|
||||||
"github.com/opencontainers/selinux/go-selinux"
|
|
||||||
"github.com/opencontainers/selinux/go-selinux/label"
|
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
||||||
|
|
||||||
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
||||||
)
|
)
|
||||||
@ -111,93 +108,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
|
|||||||
return mounts
|
return mounts
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *criService) platformSpec(
|
|
||||||
id string,
|
|
||||||
sandboxID string,
|
|
||||||
config *runtime.ContainerConfig,
|
|
||||||
sandboxConfig *runtime.PodSandboxConfig,
|
|
||||||
imageConfig *imagespec.ImageConfig,
|
|
||||||
extraMounts []*runtime.Mount,
|
|
||||||
) (_ []oci.SpecOpts, retErr error) {
|
|
||||||
specOpts := []oci.SpecOpts{}
|
|
||||||
|
|
||||||
securityContext := config.GetLinux().GetSecurityContext()
|
|
||||||
labelOptions, err := toLabel(securityContext.GetSelinuxOptions())
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
if len(labelOptions) == 0 {
|
|
||||||
// Use pod level SELinux config
|
|
||||||
if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil {
|
|
||||||
labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
processLabel, mountLabel, err := label.InitLabels(labelOptions)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
|
|
||||||
}
|
|
||||||
defer func() {
|
|
||||||
if retErr != nil {
|
|
||||||
selinux.ReleaseLabel(processLabel)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
specOpts = append(specOpts,
|
|
||||||
customopts.WithSelinuxLabels(processLabel, mountLabel),
|
|
||||||
customopts.WithMounts(c.os, config, extraMounts, mountLabel),
|
|
||||||
customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
|
|
||||||
customopts.WithCapabilities(securityContext, c.allCaps),
|
|
||||||
)
|
|
||||||
|
|
||||||
if c.config.DisableCgroup {
|
|
||||||
specOpts = append(specOpts, customopts.WithDisabledCgroups)
|
|
||||||
} else {
|
|
||||||
specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController))
|
|
||||||
if sandboxConfig.GetLinux().GetCgroupParent() != "" {
|
|
||||||
cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id)
|
|
||||||
specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get blockio class
|
|
||||||
blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to set blockio class: %w", err)
|
|
||||||
}
|
|
||||||
if blockIOClass != "" {
|
|
||||||
if linuxBlockIO, err := blockIOToLinuxOci(blockIOClass); err == nil {
|
|
||||||
specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO))
|
|
||||||
} else {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get RDT class
|
|
||||||
rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to set RDT class: %w", err)
|
|
||||||
}
|
|
||||||
if rdtClass != "" {
|
|
||||||
specOpts = append(specOpts, oci.WithRdt(rdtClass, "", ""))
|
|
||||||
}
|
|
||||||
|
|
||||||
specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj))
|
|
||||||
|
|
||||||
// cgroupns is used for hiding /sys/fs/cgroup from containers.
|
|
||||||
// For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged.
|
|
||||||
// https://github.com/containers/libpod/issues/4363
|
|
||||||
// https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace
|
|
||||||
if cgroups.Mode() == cgroups.Unified && !securityContext.GetPrivileged() {
|
|
||||||
specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace}))
|
|
||||||
}
|
|
||||||
|
|
||||||
return specOpts, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
|
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
|
||||||
var specOpts []oci.SpecOpts
|
var specOpts []oci.SpecOpts
|
||||||
securityContext := config.GetLinux().GetSecurityContext()
|
securityContext := config.GetLinux().GetSecurityContext()
|
||||||
|
@ -19,10 +19,11 @@
|
|||||||
package sbserver
|
package sbserver
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/containerd/containerd/oci"
|
|
||||||
"github.com/containerd/containerd/snapshots"
|
|
||||||
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd/oci"
|
||||||
|
"github.com/containerd/containerd/snapshots"
|
||||||
)
|
)
|
||||||
|
|
||||||
// containerMounts sets up necessary container system file mounts
|
// containerMounts sets up necessary container system file mounts
|
||||||
@ -31,17 +32,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
|
|||||||
return []*runtime.Mount{}
|
return []*runtime.Mount{}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *criService) platformSpec(
|
|
||||||
id string,
|
|
||||||
sandboxID string,
|
|
||||||
config *runtime.ContainerConfig,
|
|
||||||
sandboxConfig *runtime.PodSandboxConfig,
|
|
||||||
imageConfig *imagespec.ImageConfig,
|
|
||||||
extraMounts []*runtime.Mount,
|
|
||||||
) ([]oci.SpecOpts, error) {
|
|
||||||
return []oci.SpecOpts{}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
|
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
|
||||||
return []oci.SpecOpts{}, nil
|
return []oci.SpecOpts{}, nil
|
||||||
}
|
}
|
||||||
|
@ -23,7 +23,6 @@ import (
|
|||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
"github.com/containerd/containerd/oci"
|
"github.com/containerd/containerd/oci"
|
||||||
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
|
||||||
"github.com/containerd/containerd/snapshots"
|
"github.com/containerd/containerd/snapshots"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -32,49 +31,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *criService) platformSpec(
|
|
||||||
id string,
|
|
||||||
sandboxID string,
|
|
||||||
config *runtime.ContainerConfig,
|
|
||||||
sandboxConfig *runtime.PodSandboxConfig,
|
|
||||||
imageConfig *imagespec.ImageConfig,
|
|
||||||
extraMounts []*runtime.Mount,
|
|
||||||
) ([]oci.SpecOpts, error) {
|
|
||||||
specOpts := []oci.SpecOpts{}
|
|
||||||
|
|
||||||
specOpts = append(specOpts,
|
|
||||||
customopts.WithWindowsMounts(c.os, config, extraMounts),
|
|
||||||
customopts.WithDevices(config),
|
|
||||||
)
|
|
||||||
|
|
||||||
// Start with the image config user and override below if RunAsUsername is not "".
|
|
||||||
username := imageConfig.User
|
|
||||||
|
|
||||||
windowsConfig := config.GetWindows()
|
|
||||||
if windowsConfig != nil {
|
|
||||||
specOpts = append(specOpts, customopts.WithWindowsResources(windowsConfig.GetResources()))
|
|
||||||
securityCtx := windowsConfig.GetSecurityContext()
|
|
||||||
if securityCtx != nil {
|
|
||||||
runAsUser := securityCtx.GetRunAsUsername()
|
|
||||||
if runAsUser != "" {
|
|
||||||
username = runAsUser
|
|
||||||
}
|
|
||||||
cs := securityCtx.GetCredentialSpec()
|
|
||||||
if cs != "" {
|
|
||||||
specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// There really isn't a good Windows way to verify that the username is available in the
|
|
||||||
// image as early as here like there is for Linux. Later on in the stack hcsshim
|
|
||||||
// will handle the behavior of erroring out if the user isn't available in the image
|
|
||||||
// when trying to run the init process.
|
|
||||||
specOpts = append(specOpts, oci.WithUser(username))
|
|
||||||
|
|
||||||
return specOpts, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// No extra spec options needed for windows.
|
// No extra spec options needed for windows.
|
||||||
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
|
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
|
@ -21,6 +21,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"path"
|
"path"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
goruntime "runtime"
|
goruntime "runtime"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@ -603,3 +604,180 @@ func hostNetwork(config *runtime.PodSandboxConfig) bool {
|
|||||||
}
|
}
|
||||||
return hostNet
|
return hostNet
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getCgroupsPath builds the cgroups path of a container from the cgroup
// parent and the container id.
//
// When the last element of cgroupsParent ends in ".slice" the result is
// "<last element>:cri-containerd:<id>"; otherwise id is joined onto
// cgroupsParent as a file path.
func getCgroupsPath(cgroupsParent, id string) string {
	parentBase := path.Base(cgroupsParent)
	if !strings.HasSuffix(parentBase, ".slice") {
		// Not a slice name: place the container directly below the parent.
		return filepath.Join(cgroupsParent, id)
	}
	// For a.slice/b.slice/c.slice, parentBase is c.slice.
	// runc systemd cgroup path format is "slice:prefix:name".
	return parentBase + ":cri-containerd:" + id
}
|
||||||
|
|
||||||
|
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
|
||||||
|
var labels []string
|
||||||
|
|
||||||
|
if selinuxOptions == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if selinuxOptions.User != "" {
|
||||||
|
labels = append(labels, "user:"+selinuxOptions.User)
|
||||||
|
}
|
||||||
|
if selinuxOptions.Role != "" {
|
||||||
|
labels = append(labels, "role:"+selinuxOptions.Role)
|
||||||
|
}
|
||||||
|
if selinuxOptions.Type != "" {
|
||||||
|
labels = append(labels, "type:"+selinuxOptions.Type)
|
||||||
|
}
|
||||||
|
if selinuxOptions.Level != "" {
|
||||||
|
labels = append(labels, "level:"+selinuxOptions.Level)
|
||||||
|
}
|
||||||
|
|
||||||
|
return labels, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// selinuxLevelPattern describes the accepted SELinux level syntax: a
// sensitivity "s<digit>", an optional "-s<digit>" upper bound, and an optional
// ":"-prefixed, comma-separated list of categories "c<1-4 digits>", each of
// which may be a "."-joined pair. It is compiled once at package
// initialization instead of on every call.
var selinuxLevelPattern = regexp.MustCompile(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`)

// checkSelinuxLevel validates the format of an SELinux level string.
//
// An empty level is accepted. Any other value must match
// selinuxLevelPattern; otherwise an error quoting the offending level is
// returned.
func checkSelinuxLevel(level string) error {
	if level == "" {
		return nil
	}
	if !selinuxLevelPattern.MatchString(level) {
		return fmt.Errorf("the format of 'level' %q is not correct", level)
	}
	return nil
}
|
||||||
|
|
||||||
|
func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) {
|
||||||
|
var m []runtimespec.LinuxIDMapping
|
||||||
|
|
||||||
|
if len(runtimeIDMap) == 0 {
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(runtimeIDMap) > 1 {
|
||||||
|
// We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
|
||||||
|
return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap))
|
||||||
|
}
|
||||||
|
|
||||||
|
// We know len is 1 now.
|
||||||
|
if runtimeIDMap[0] == nil {
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
uidMap := *runtimeIDMap[0]
|
||||||
|
|
||||||
|
if uidMap.Length < 1 {
|
||||||
|
return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length)
|
||||||
|
}
|
||||||
|
|
||||||
|
m = []runtimespec.LinuxIDMapping{
|
||||||
|
{
|
||||||
|
ContainerID: uidMap.ContainerId,
|
||||||
|
HostID: uidMap.HostId,
|
||||||
|
Size: uidMap.Length,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) {
|
||||||
|
if userns == nil {
|
||||||
|
// If userns is not set, the kubelet doesn't support this option
|
||||||
|
// and we should just fallback to no userns. This is completely
|
||||||
|
// valid.
|
||||||
|
return nil, nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
uids, err := parseUsernsIDMap(userns.GetUids())
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("UID mapping: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
gids, err = parseUsernsIDMap(userns.GetGids())
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("GID mapping: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
switch mode := userns.GetMode(); mode {
|
||||||
|
case runtime.NamespaceMode_NODE:
|
||||||
|
if len(uids) != 0 || len(gids) != 0 {
|
||||||
|
return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids))
|
||||||
|
}
|
||||||
|
case runtime.NamespaceMode_POD:
|
||||||
|
// This is valid, we will handle it in WithPodNamespaces().
|
||||||
|
if len(uids) == 0 || len(gids) == 0 {
|
||||||
|
return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
|
||||||
|
}
|
||||||
|
|
||||||
|
return uids, gids, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// sameUsernsConfig checks if the userns configs are the same. If the mappings
|
||||||
|
// on each config are the same but in different order, it returns false.
|
||||||
|
// XXX: If the runtime.UserNamespace struct changes, we should update this
|
||||||
|
// function accordingly.
|
||||||
|
func sameUsernsConfig(a, b *runtime.UserNamespace) bool {
|
||||||
|
// If both are nil, they are the same.
|
||||||
|
if a == nil && b == nil {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
// If only one is nil, they are different.
|
||||||
|
if a == nil || b == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
// At this point, a is not nil nor b.
|
||||||
|
|
||||||
|
if a.GetMode() != b.GetMode() {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
aUids, aGids, err := parseUsernsIDs(a)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
bUids, bGids, err := parseUsernsIDs(b)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
if !sameMapping(aUids, bUids) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !sameMapping(aGids, bGids) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// sameMapping checks if the mappings are the same. If the mappings are the same
|
||||||
|
// but in different order, it returns false.
|
||||||
|
func sameMapping(a, b []runtimespec.LinuxIDMapping) bool {
|
||||||
|
if len(a) != len(b) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
for x := range a {
|
||||||
|
if a[x].ContainerID != b[x].ContainerID {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if a[x].HostID != b[x].HostID {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if a[x].Size != b[x].Size {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
@ -20,23 +20,22 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/containerd/cgroups/v3"
|
||||||
|
"github.com/moby/sys/mountinfo"
|
||||||
|
"github.com/opencontainers/runtime-spec/specs-go"
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
|
||||||
"github.com/containerd/containerd/log"
|
"github.com/containerd/containerd/log"
|
||||||
"github.com/containerd/containerd/mount"
|
"github.com/containerd/containerd/mount"
|
||||||
"github.com/containerd/containerd/pkg/apparmor"
|
"github.com/containerd/containerd/pkg/apparmor"
|
||||||
"github.com/containerd/containerd/pkg/seccomp"
|
"github.com/containerd/containerd/pkg/seccomp"
|
||||||
"github.com/containerd/containerd/pkg/seutil"
|
"github.com/containerd/containerd/pkg/seutil"
|
||||||
"github.com/moby/sys/mountinfo"
|
|
||||||
"github.com/opencontainers/runtime-spec/specs-go"
|
|
||||||
"golang.org/x/sys/unix"
|
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -50,17 +49,6 @@ const (
|
|||||||
resolvConfPath = "/etc/resolv.conf"
|
resolvConfPath = "/etc/resolv.conf"
|
||||||
)
|
)
|
||||||
|
|
||||||
// getCgroupsPath generates container cgroups path.
|
|
||||||
func getCgroupsPath(cgroupsParent, id string) string {
|
|
||||||
base := path.Base(cgroupsParent)
|
|
||||||
if strings.HasSuffix(base, ".slice") {
|
|
||||||
// For a.slice/b.slice/c.slice, base is c.slice.
|
|
||||||
// runc systemd cgroup path format is "slice:prefix:name".
|
|
||||||
return strings.Join([]string{base, "cri-containerd", id}, ":")
|
|
||||||
}
|
|
||||||
return filepath.Join(cgroupsParent, id)
|
|
||||||
}
|
|
||||||
|
|
||||||
// getSandboxRootDir returns the root directory for managing sandbox files,
|
// getSandboxRootDir returns the root directory for managing sandbox files,
|
||||||
// e.g. hosts files.
|
// e.g. hosts files.
|
||||||
func (c *criService) getSandboxRootDir(id string) string {
|
func (c *criService) getSandboxRootDir(id string) string {
|
||||||
@ -93,46 +81,6 @@ func (c *criService) getSandboxDevShm(id string) string {
|
|||||||
return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
|
return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
|
||||||
}
|
}
|
||||||
|
|
||||||
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
|
|
||||||
var labels []string
|
|
||||||
|
|
||||||
if selinuxOptions == nil {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
if selinuxOptions.User != "" {
|
|
||||||
labels = append(labels, "user:"+selinuxOptions.User)
|
|
||||||
}
|
|
||||||
if selinuxOptions.Role != "" {
|
|
||||||
labels = append(labels, "role:"+selinuxOptions.Role)
|
|
||||||
}
|
|
||||||
if selinuxOptions.Type != "" {
|
|
||||||
labels = append(labels, "type:"+selinuxOptions.Type)
|
|
||||||
}
|
|
||||||
if selinuxOptions.Level != "" {
|
|
||||||
labels = append(labels, "level:"+selinuxOptions.Level)
|
|
||||||
}
|
|
||||||
|
|
||||||
return labels, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func checkSelinuxLevel(level string) error {
|
|
||||||
if len(level) == 0 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err)
|
|
||||||
}
|
|
||||||
if !matched {
|
|
||||||
return fmt.Errorf("the format of 'level' %q is not correct", level)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// apparmorEnabled returns true if apparmor is enabled, supported by the host,
|
// apparmorEnabled returns true if apparmor is enabled, supported by the host,
|
||||||
// if apparmor_parser is installed, and if we are not running docker-in-docker.
|
// if apparmor_parser is installed, and if we are not running docker-in-docker.
|
||||||
func (c *criService) apparmorEnabled() bool {
|
func (c *criService) apparmorEnabled() bool {
|
||||||
@ -270,3 +218,9 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
|||||||
spec.Process.SelinuxLabel = l
|
spec.Process.SelinuxLabel = l
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getCgroupsMode returns cgropu mode.
|
||||||
|
// TODO: add build constraints to cgroups package and remove this helper
|
||||||
|
func isUnifiedCgroupsMode() bool {
|
||||||
|
return cgroups.Mode() == cgroups.Unified
|
||||||
|
}
|
||||||
|
@ -41,3 +41,7 @@ func ensureRemoveAll(ctx context.Context, dir string) error {
|
|||||||
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isUnifiedCgroupsMode always reports false in this build variant.
func isUnifiedCgroupsMode() bool {
	return false
}
|
||||||
|
@ -166,3 +166,7 @@ func ensureRemoveAll(_ context.Context, dir string) error {
|
|||||||
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isUnifiedCgroupsMode always reports false in this build variant.
func isUnifiedCgroupsMode() bool {
	return false
}
|
||||||
|
@ -21,12 +21,13 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
|
||||||
"github.com/containerd/containerd/oci"
|
|
||||||
"github.com/containerd/containerd/snapshots"
|
|
||||||
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd/oci"
|
||||||
|
"github.com/containerd/containerd/snapshots"
|
||||||
|
|
||||||
"github.com/containerd/containerd/pkg/cri/annotations"
|
"github.com/containerd/containerd/pkg/cri/annotations"
|
||||||
"github.com/containerd/containerd/pkg/cri/config"
|
"github.com/containerd/containerd/pkg/cri/config"
|
||||||
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
||||||
@ -89,7 +90,7 @@ func (c *criService) containerSpec(
|
|||||||
oci.WithHostname(sandboxConfig.GetHostname()),
|
oci.WithHostname(sandboxConfig.GetHostname()),
|
||||||
)
|
)
|
||||||
|
|
||||||
specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithDevices(config))
|
specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithWindowsDevices(config))
|
||||||
|
|
||||||
// Start with the image config user and override below if RunAsUsername is not "".
|
// Start with the image config user and override below if RunAsUsername is not "".
|
||||||
username := imageConfig.User
|
username := imageConfig.User
|
||||||
|
Loading…
Reference in New Issue
Block a user