
Motivation:

For pod-level user namespaces, it's impossible to force the container runtime to join an existing network namespace after creating a new user namespace.

According to the capabilities section in [user_namespaces(7)][1], a network namespace created by containerd is owned by the root user namespace. When the container runtime (like runc or crun) creates a new user namespace, it becomes a child of the root user namespace. Processes within this child user namespace are not permitted to access resources owned by the parent user namespace.

If the network namespace is not owned by the new user namespace, the container runtime will fail to mount /sys due to the [sysfs: Restrict mounting sysfs][2] patch.

Referencing the [cap_capable][3] function in Linux, a process can access a resource if:

* The resource is owned by the process's user namespace, and the process has the required capability.
* The resource is owned by a child of the process's user namespace, and the owner's user namespace was created by the process's UID.

In the context of pod-level user namespaces, the CRI plugin delegates the creation of the network namespace to the container runtime when running the pause container. After the pause container is initialized, the CRI plugin pins the pause container's network namespace into `/run/netns` and then executes the `CNI_ADD` command over it.

However, if the pause container is terminated during the pinning process, its PID may be recycled for an unrelated process, leading to the `CNI_ADD` command operating on an incorrect network namespace.

Moreover, rolling back the `RunPodSandbox` API is complex due to the delegation of network namespace creation. As highlighted in issue #10363, the CRI plugin can lose IP information after a containerd restart, making it challenging to maintain robustness in the `RunPodSandbox` API.

Solution:

Allow containerd to create a new user namespace and then create the network namespace within that user namespace.
This way, the CRI plugin can force the container runtime to join both the user namespace and the network namespace. Since the network namespace is owned by the newly created user namespace, the container runtime will have the necessary permissions to mount `/sys` on the container's root filesystem. As a result, delegation of network namespace creation is no longer needed.

NOTE:

* The CRI plugin does not need to pin the newly created user namespace as it does with the network namespace, because the kernel allows retrieving a user namespace reference via [ioctl_ns(2)][4]. As a result, the podsandbox implementation can obtain the user namespace using the `netnsPath` parameter.

[1]: <https://man7.org/linux/man-pages/man7/user_namespaces.7.html>
[2]: <https://github.com/torvalds/linux/commit/7dc5dbc879>
[3]: <https://github.com/torvalds/linux/blob/2c85ebc57b/security/commoncap.c#L65>
[4]: <https://man7.org/linux/man-pages/man2/ioctl_ns.2.html>

Signed-off-by: Wei Fu <fuweid89@gmail.com>
384 lines
12 KiB
Go
384 lines
12 KiB
Go
/*
|
|
Copyright The containerd Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package opts
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
|
|
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
|
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
|
|
"github.com/containerd/containerd/v2/core/containers"
|
|
"github.com/containerd/containerd/v2/internal/cri/util"
|
|
"github.com/containerd/containerd/v2/pkg/oci"
|
|
)
|
|
|
|
// DefaultSandboxCPUshares is the default CPU shares value for the sandbox container.
|
|
// TODO(windows): Revisit CPU shares for Windows (https://github.com/containerd/cri/issues/1297)
|
|
const DefaultSandboxCPUshares = 2
|
|
|
|
// WithRelativeRoot sets the root for the container
|
|
func WithRelativeRoot(root string) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
|
if s.Root == nil {
|
|
s.Root = &runtimespec.Root{}
|
|
}
|
|
s.Root.Path = root
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithoutRoot sets the root to nil for the container.
|
|
func WithoutRoot(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
s.Root = nil
|
|
return nil
|
|
}
|
|
|
|
// WithProcessArgs sets the process args on the spec based on the image and runtime config
|
|
func WithProcessArgs(config *runtime.ContainerConfig, image *imagespec.ImageConfig) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
|
command, args := config.GetCommand(), config.GetArgs()
|
|
// The following logic is migrated from https://github.com/moby/moby/blob/master/daemon/commit.go
|
|
// TODO(random-liu): Clearly define the commands overwrite behavior.
|
|
if len(command) == 0 {
|
|
// Copy array to avoid data race.
|
|
if len(args) == 0 {
|
|
args = append([]string{}, image.Cmd...)
|
|
}
|
|
if command == nil {
|
|
if !(len(image.Entrypoint) == 1 && image.Entrypoint[0] == "") {
|
|
command = append([]string{}, image.Entrypoint...)
|
|
}
|
|
}
|
|
}
|
|
if len(command) == 0 && len(args) == 0 {
|
|
return errors.New("no command specified")
|
|
}
|
|
return oci.WithProcessArgs(append(command, args...)...)(ctx, client, c, s)
|
|
}
|
|
}
|
|
|
|
// mounts defines how to sort runtime.Mount.
|
|
// This is the same with the Docker implementation:
|
|
//
|
|
// https://github.com/moby/moby/blob/17.05.x/daemon/volumes.go#L26
|
|
type orderedMounts []*runtime.Mount
|
|
|
|
// Len returns the number of mounts. Used in sorting.
|
|
func (m orderedMounts) Len() int {
|
|
return len(m)
|
|
}
|
|
|
|
// Less returns true if the number of parts (a/b/c would be 3 parts) in the
|
|
// mount indexed by parameter 1 is less than that of the mount indexed by
|
|
// parameter 2. Used in sorting.
|
|
func (m orderedMounts) Less(i, j int) bool {
|
|
return m.parts(i) < m.parts(j)
|
|
}
|
|
|
|
// Swap swaps two items in an array of mounts. Used in sorting
|
|
func (m orderedMounts) Swap(i, j int) {
|
|
m[i], m[j] = m[j], m[i]
|
|
}
|
|
|
|
// parts returns the number of parts in the destination of a mount. Used in sorting.
|
|
func (m orderedMounts) parts(i int) int {
|
|
return strings.Count(filepath.Clean(m[i].ContainerPath), string(os.PathSeparator))
|
|
}
|
|
|
|
// WithAnnotation sets the provided annotation
|
|
func WithAnnotation(k, v string) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Annotations == nil {
|
|
s.Annotations = make(map[string]string)
|
|
}
|
|
s.Annotations[k] = v
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithAdditionalGIDs adds any additional groups listed for a particular user in the
|
|
// /etc/groups file of the image's root filesystem to the OCI spec's additionalGids array.
|
|
func WithAdditionalGIDs(userstr string) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
|
if s.Process == nil {
|
|
s.Process = &runtimespec.Process{}
|
|
}
|
|
gids := s.Process.User.AdditionalGids
|
|
if err := oci.WithAdditionalGIDs(userstr)(ctx, client, c, s); err != nil {
|
|
return err
|
|
}
|
|
// Merge existing gids and new gids.
|
|
s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, gids)
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// mergeGids returns the union of gids1 and gids2 with duplicates removed,
// sorted in ascending order. It returns nil when both inputs are empty.
func mergeGids(gids1, gids2 []uint32) []uint32 {
	// The number of distinct gids is bounded by the combined input length,
	// so size the set up front instead of growing it incrementally.
	seen := make(map[uint32]struct{}, len(gids1)+len(gids2))
	for _, gid := range gids1 {
		seen[gid] = struct{}{}
	}
	for _, gid := range gids2 {
		seen[gid] = struct{}{}
	}
	if len(seen) == 0 {
		// Keep returning a nil slice for empty input.
		return nil
	}
	merged := make([]uint32, 0, len(seen))
	for gid := range seen {
		merged = append(merged, gid)
	}
	// Map iteration order is random; sort for a deterministic result.
	sort.Slice(merged, func(i, j int) bool { return merged[i] < merged[j] })
	return merged
}
|
|
|
|
// WithoutDefaultSecuritySettings removes the default security settings generated on a spec
|
|
func WithoutDefaultSecuritySettings(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Process == nil {
|
|
s.Process = &runtimespec.Process{}
|
|
}
|
|
// Make sure no default seccomp/apparmor is specified
|
|
s.Process.ApparmorProfile = ""
|
|
if s.Linux != nil {
|
|
s.Linux.Seccomp = nil
|
|
}
|
|
// Remove default rlimits (See https://github.com/containerd/cri/issues/515)
|
|
s.Process.Rlimits = nil
|
|
return nil
|
|
}
|
|
|
|
// WithCapabilities sets the provided capabilities from the security context
|
|
func WithCapabilities(sc *runtime.LinuxContainerSecurityContext, allCaps []string) oci.SpecOpts {
|
|
capabilities := sc.GetCapabilities()
|
|
if capabilities == nil {
|
|
return nullOpt
|
|
}
|
|
|
|
var opts []oci.SpecOpts
|
|
// Add/drop all capabilities if "all" is specified, so that
|
|
// following individual add/drop could still work. E.g.
|
|
// AddCapabilities: []string{"ALL"}, DropCapabilities: []string{"CHOWN"}
|
|
// will be all capabilities without `CAP_CHOWN`.
|
|
if util.InStringSlice(capabilities.GetAddCapabilities(), "ALL") {
|
|
opts = append(opts, oci.WithCapabilities(allCaps))
|
|
}
|
|
if util.InStringSlice(capabilities.GetDropCapabilities(), "ALL") {
|
|
opts = append(opts, oci.WithCapabilities(nil))
|
|
}
|
|
|
|
var caps []string
|
|
for _, c := range capabilities.GetAddCapabilities() {
|
|
if strings.ToUpper(c) == "ALL" {
|
|
continue
|
|
}
|
|
// Capabilities in CRI doesn't have `CAP_` prefix, so add it.
|
|
caps = append(caps, "CAP_"+strings.ToUpper(c))
|
|
}
|
|
opts = append(opts, oci.WithAddedCapabilities(caps))
|
|
|
|
caps = []string{}
|
|
for _, c := range capabilities.GetDropCapabilities() {
|
|
if strings.ToUpper(c) == "ALL" {
|
|
continue
|
|
}
|
|
caps = append(caps, "CAP_"+strings.ToUpper(c))
|
|
}
|
|
opts = append(opts, oci.WithDroppedCapabilities(caps))
|
|
return oci.Compose(opts...)
|
|
}
|
|
|
|
func nullOpt(_ context.Context, _ oci.Client, _ *containers.Container, _ *runtimespec.Spec) error {
|
|
return nil
|
|
}
|
|
|
|
// WithoutAmbientCaps removes the ambient caps from the spec
|
|
func WithoutAmbientCaps(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Process == nil {
|
|
s.Process = &runtimespec.Process{}
|
|
}
|
|
if s.Process.Capabilities == nil {
|
|
s.Process.Capabilities = &runtimespec.LinuxCapabilities{}
|
|
}
|
|
s.Process.Capabilities.Ambient = nil
|
|
return nil
|
|
}
|
|
|
|
// WithSelinuxLabels sets the mount and process labels
|
|
func WithSelinuxLabels(process, mount string) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
|
|
if s.Linux == nil {
|
|
s.Linux = &runtimespec.Linux{}
|
|
}
|
|
if s.Process == nil {
|
|
s.Process = &runtimespec.Process{}
|
|
}
|
|
s.Linux.MountLabel = mount
|
|
s.Process.SelinuxLabel = process
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithSysctls sets the provided sysctls onto the spec
|
|
func WithSysctls(sysctls map[string]string) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Linux == nil {
|
|
s.Linux = &runtimespec.Linux{}
|
|
}
|
|
if s.Linux.Sysctl == nil {
|
|
s.Linux.Sysctl = make(map[string]string)
|
|
}
|
|
for k, v := range sysctls {
|
|
s.Linux.Sysctl[k] = v
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithSupplementalGroups sets the supplemental groups for the process
|
|
func WithSupplementalGroups(groups []int64) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Process == nil {
|
|
s.Process = &runtimespec.Process{}
|
|
}
|
|
var guids []uint32
|
|
for _, g := range groups {
|
|
guids = append(guids, uint32(g))
|
|
}
|
|
s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, guids)
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithDefaultSandboxShares sets the default sandbox CPU shares
|
|
func WithDefaultSandboxShares(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Linux == nil {
|
|
s.Linux = &runtimespec.Linux{}
|
|
}
|
|
if s.Linux.Resources == nil {
|
|
s.Linux.Resources = &runtimespec.LinuxResources{}
|
|
}
|
|
if s.Linux.Resources.CPU == nil {
|
|
s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
|
|
}
|
|
i := uint64(DefaultSandboxCPUshares)
|
|
s.Linux.Resources.CPU.Shares = &i
|
|
return nil
|
|
}
|
|
|
|
// WithoutNamespace removes the provided namespace
|
|
func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Linux == nil {
|
|
return nil
|
|
}
|
|
var namespaces []runtimespec.LinuxNamespace
|
|
for i, ns := range s.Linux.Namespaces {
|
|
if ns.Type != t {
|
|
namespaces = append(namespaces, s.Linux.Namespaces[i])
|
|
}
|
|
}
|
|
s.Linux.Namespaces = namespaces
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// WithNamespacePath updates namespace with existing path.
|
|
func WithNamespacePath(t runtimespec.LinuxNamespaceType, nsPath string) oci.SpecOpts {
|
|
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
|
if s.Linux == nil {
|
|
return fmt.Errorf("Linux spec is required")
|
|
}
|
|
|
|
for i, ns := range s.Linux.Namespaces {
|
|
if ns.Type == t {
|
|
s.Linux.Namespaces[i].Path = nsPath
|
|
return nil
|
|
}
|
|
}
|
|
return fmt.Errorf("no such namespace %s", t)
|
|
}
|
|
}
|
|
|
|
// WithPodNamespaces sets the pod namespaces for the container
|
|
func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts {
|
|
namespaces := config.GetNamespaceOptions()
|
|
|
|
opts := []oci.SpecOpts{
|
|
oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.NetworkNamespace, Path: GetNetworkNamespace(sandboxPid)}),
|
|
oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.IPCNamespace, Path: GetIPCNamespace(sandboxPid)}),
|
|
oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UTSNamespace, Path: GetUTSNamespace(sandboxPid)}),
|
|
}
|
|
if namespaces.GetPid() != runtime.NamespaceMode_CONTAINER {
|
|
opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.PIDNamespace, Path: GetPIDNamespace(targetPid)}))
|
|
}
|
|
|
|
if namespaces.GetUsernsOptions() != nil {
|
|
switch namespaces.GetUsernsOptions().GetMode() {
|
|
case runtime.NamespaceMode_NODE:
|
|
// Nothing to do. Not adding userns field uses the node userns.
|
|
case runtime.NamespaceMode_POD:
|
|
opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UserNamespace, Path: GetUserNamespace(sandboxPid)}))
|
|
opts = append(opts, oci.WithUserNamespace(uids, gids))
|
|
}
|
|
}
|
|
|
|
return oci.Compose(opts...)
|
|
}
|
|
|
|
const (
|
|
// netNSFormat is the format of the procfs path of a process's network namespace; the verb takes the pid.
|
|
netNSFormat = "/proc/%v/ns/net"
|
|
// ipcNSFormat is the format of the procfs path of a process's ipc namespace; the verb takes the pid.
|
|
ipcNSFormat = "/proc/%v/ns/ipc"
|
|
// utsNSFormat is the format of the procfs path of a process's uts namespace; the verb takes the pid.
|
|
utsNSFormat = "/proc/%v/ns/uts"
|
|
// pidNSFormat is the format of the procfs path of a process's pid namespace; the verb takes the pid.
|
|
pidNSFormat = "/proc/%v/ns/pid"
|
|
// userNSFormat is the format of the procfs path of a process's user namespace; the verb takes the pid.
|
|
userNSFormat = "/proc/%v/ns/user"
|
|
)
|
|
|
|
// GetNetworkNamespace returns the network namespace of a process.
|
|
func GetNetworkNamespace(pid uint32) string {
|
|
return fmt.Sprintf(netNSFormat, pid)
|
|
}
|
|
|
|
// GetIPCNamespace returns the ipc namespace of a process.
|
|
func GetIPCNamespace(pid uint32) string {
|
|
return fmt.Sprintf(ipcNSFormat, pid)
|
|
}
|
|
|
|
// GetUTSNamespace returns the uts namespace of a process.
|
|
func GetUTSNamespace(pid uint32) string {
|
|
return fmt.Sprintf(utsNSFormat, pid)
|
|
}
|
|
|
|
// GetPIDNamespace returns the pid namespace of a process.
|
|
func GetPIDNamespace(pid uint32) string {
|
|
return fmt.Sprintf(pidNSFormat, pid)
|
|
}
|
|
|
|
// GetUserNamespace returns the user namespace of a process.
|
|
func GetUserNamespace(pid uint32) string {
|
|
return fmt.Sprintf(userNSFormat, pid)
|
|
}
|