Merge pull request #8803 from kinvolk/rata/userns-sbserver
cri/sbserver: Add support for user namespaces (KEP-127)
This commit is contained in:
commit
a94918b591
@ -94,9 +94,6 @@ func TestPodUserNS(t *testing.T) {
|
||||
},
|
||||
} {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
if os.Getenv("ENABLE_CRI_SANDBOXES") == "'sandboxed'" {
|
||||
t.Skip("skipping test: userns not supported/needed in sanboxed runtimes")
|
||||
}
|
||||
cmd := exec.Command("true")
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||
Cloneflags: syscall.CLONE_NEWUSER,
|
||||
|
@ -206,7 +206,10 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
|
||||
log.G(ctx).Debugf("Container %q spec: %#+v", id, spew.NewFormatter(spec))
|
||||
|
||||
// Grab any platform specific snapshotter opts.
|
||||
sOpts := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config)
|
||||
sOpts, err := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Set snapshotter before any other options.
|
||||
opts := []containerd.NewContainerOpts{
|
||||
|
@ -264,6 +264,7 @@ func appArmorProfileExists(profile string) (bool, error) {
|
||||
}
|
||||
|
||||
// snapshotterOpts returns any Linux specific snapshotter options for the rootfs snapshot
|
||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt {
|
||||
return []snapshots.Opt{}
|
||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) {
|
||||
nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions()
|
||||
return snapshotterRemapOpts(nsOpts)
|
||||
}
|
||||
|
@ -31,6 +31,6 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon
|
||||
}
|
||||
|
||||
// snapshotterOpts returns snapshotter options for the rootfs snapshot
|
||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt {
|
||||
return []snapshots.Opt{}
|
||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) {
|
||||
return []snapshots.Opt{}, nil
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon
|
||||
}
|
||||
|
||||
// snapshotterOpts returns any Windows specific snapshotter options for the r/w layer
|
||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt {
|
||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) {
|
||||
var opts []snapshots.Opt
|
||||
|
||||
switch snapshotterName {
|
||||
@ -47,5 +47,5 @@ func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []
|
||||
}
|
||||
}
|
||||
|
||||
return opts
|
||||
return opts, nil
|
||||
}
|
||||
|
@ -30,12 +30,15 @@ import (
|
||||
"github.com/moby/sys/mountinfo"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"golang.org/x/sys/unix"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
|
||||
"github.com/containerd/containerd"
|
||||
"github.com/containerd/containerd/log"
|
||||
"github.com/containerd/containerd/mount"
|
||||
"github.com/containerd/containerd/pkg/apparmor"
|
||||
"github.com/containerd/containerd/pkg/seccomp"
|
||||
"github.com/containerd/containerd/pkg/seutil"
|
||||
"github.com/containerd/containerd/snapshots"
|
||||
)
|
||||
|
||||
// apparmorEnabled returns true if apparmor is enabled, supported by the host,
|
||||
@ -181,3 +184,21 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
||||
func isUnifiedCgroupsMode() bool {
|
||||
return cgroups.Mode() == cgroups.Unified
|
||||
}
|
||||
|
||||
func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) {
|
||||
snapshotOpt := []snapshots.Opt{}
|
||||
usernsOpts := nsOpts.GetUsernsOptions()
|
||||
if usernsOpts == nil {
|
||||
return snapshotOpt, nil
|
||||
}
|
||||
|
||||
uids, gids, err := parseUsernsIDs(usernsOpts)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("user namespace configuration: %w", err)
|
||||
}
|
||||
|
||||
if usernsOpts.GetMode() == runtime.NamespaceMode_POD {
|
||||
snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size))
|
||||
}
|
||||
return snapshotOpt, nil
|
||||
}
|
||||
|
@ -28,12 +28,15 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/containerd/containerd"
|
||||
"github.com/containerd/containerd/log"
|
||||
"github.com/containerd/containerd/mount"
|
||||
"github.com/containerd/containerd/pkg/seccomp"
|
||||
"github.com/containerd/containerd/pkg/seutil"
|
||||
"github.com/containerd/containerd/snapshots"
|
||||
|
||||
"github.com/moby/sys/mountinfo"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/opencontainers/selinux/go-selinux/label"
|
||||
"golang.org/x/sys/unix"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
@ -242,7 +245,7 @@ func isVMBasedRuntime(runtimeType string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
||||
func modifyProcessLabel(runtimeType string, spec *runtimespec.Spec) error {
|
||||
if !isVMBasedRuntime(runtimeType) {
|
||||
return nil
|
||||
}
|
||||
@ -253,3 +256,89 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
||||
spec.Process.SelinuxLabel = l
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) {
|
||||
var m []runtimespec.LinuxIDMapping
|
||||
|
||||
if len(runtimeIDMap) == 0 {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
if len(runtimeIDMap) > 1 {
|
||||
// We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
|
||||
return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap))
|
||||
}
|
||||
|
||||
// We know len is 1 now.
|
||||
if runtimeIDMap[0] == nil {
|
||||
return m, nil
|
||||
}
|
||||
uidMap := *runtimeIDMap[0]
|
||||
|
||||
if uidMap.Length < 1 {
|
||||
return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length)
|
||||
}
|
||||
|
||||
m = []runtimespec.LinuxIDMapping{
|
||||
{
|
||||
ContainerID: uidMap.ContainerId,
|
||||
HostID: uidMap.HostId,
|
||||
Size: uidMap.Length,
|
||||
},
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) {
|
||||
if userns == nil {
|
||||
// If userns is not set, the kubelet doesn't support this option
|
||||
// and we should just fallback to no userns. This is completely
|
||||
// valid.
|
||||
return nil, nil, nil
|
||||
}
|
||||
|
||||
uids, err := parseUsernsIDMap(userns.GetUids())
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("UID mapping: %w", err)
|
||||
}
|
||||
|
||||
gids, err = parseUsernsIDMap(userns.GetGids())
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("GID mapping: %w", err)
|
||||
}
|
||||
|
||||
switch mode := userns.GetMode(); mode {
|
||||
case runtime.NamespaceMode_NODE:
|
||||
if len(uids) != 0 || len(gids) != 0 {
|
||||
return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids))
|
||||
}
|
||||
case runtime.NamespaceMode_POD:
|
||||
// This is valid, we will handle it in WithPodNamespaces().
|
||||
if len(uids) == 0 || len(gids) == 0 {
|
||||
return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
|
||||
}
|
||||
default:
|
||||
return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
|
||||
}
|
||||
|
||||
return uids, gids, nil
|
||||
}
|
||||
|
||||
func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) {
|
||||
snapshotOpt := []snapshots.Opt{}
|
||||
usernsOpts := nsOpts.GetUsernsOptions()
|
||||
if usernsOpts == nil {
|
||||
return snapshotOpt, nil
|
||||
}
|
||||
|
||||
uids, gids, err := parseUsernsIDs(usernsOpts)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("user namespace configuration: %w", err)
|
||||
}
|
||||
|
||||
if usernsOpts.GetMode() == runtime.NamespaceMode_POD {
|
||||
snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size))
|
||||
}
|
||||
return snapshotOpt, nil
|
||||
}
|
||||
|
@ -136,10 +136,16 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll
|
||||
|
||||
sandboxLabels := buildLabels(config.Labels, image.ImageSpec.Config.Labels, containerKindSandbox)
|
||||
|
||||
snapshotterOpt := snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations))
|
||||
snapshotterOpt := []snapshots.Opt{snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations))}
|
||||
extraSOpts, err := sandboxSnapshotterOpts(config)
|
||||
if err != nil {
|
||||
return cin, err
|
||||
}
|
||||
snapshotterOpt = append(snapshotterOpt, extraSOpts...)
|
||||
|
||||
opts := []containerd.NewContainerOpts{
|
||||
containerd.WithSnapshotter(c.runtimeSnapshotter(ctx, ociRuntime)),
|
||||
customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt),
|
||||
customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt...),
|
||||
containerd.WithSpec(spec, specOpts...),
|
||||
containerd.WithContainerLabels(sandboxLabels),
|
||||
containerd.WithContainerExtension(sandboxMetadataExtension, &metadata),
|
||||
|
@ -32,6 +32,7 @@ import (
|
||||
"github.com/containerd/containerd/pkg/cri/annotations"
|
||||
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
||||
"github.com/containerd/containerd/pkg/userns"
|
||||
"github.com/containerd/containerd/snapshots"
|
||||
)
|
||||
|
||||
func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
|
||||
@ -92,6 +93,25 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
|
||||
specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace))
|
||||
}
|
||||
|
||||
usernsOpts := nsOptions.GetUsernsOptions()
|
||||
uids, gids, err := parseUsernsIDs(usernsOpts)
|
||||
var usernsEnabled bool
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("user namespace configuration: %w", err)
|
||||
}
|
||||
|
||||
if usernsOpts != nil {
|
||||
switch mode := usernsOpts.GetMode(); mode {
|
||||
case runtime.NamespaceMode_NODE:
|
||||
specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UserNamespace))
|
||||
case runtime.NamespaceMode_POD:
|
||||
specOpts = append(specOpts, oci.WithUserNamespace(uids, gids))
|
||||
usernsEnabled = true
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
|
||||
}
|
||||
}
|
||||
|
||||
// It's fine to generate the spec before the sandbox /dev/shm
|
||||
// is actually created.
|
||||
sandboxDevShm := c.getSandboxDevShm(id)
|
||||
@ -100,9 +120,9 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
|
||||
}
|
||||
// Remove the default /dev/shm mount from defaultMounts, it is added in oci/mounts.go.
|
||||
specOpts = append(specOpts, oci.WithoutMounts(devShm))
|
||||
// In future the when user-namespace is enabled, the `nosuid, nodev, noexec` flags are
|
||||
// required, otherwise the remount will fail with EPERM. Just use them unconditionally,
|
||||
// they are nice to have anyways.
|
||||
// When user-namespace is enabled, the `nosuid, nodev, noexec` flags are
|
||||
// required, otherwise the remount will fail with EPERM. Just use them
|
||||
// unconditionally, they are nice to have anyways.
|
||||
specOpts = append(specOpts, oci.WithMounts([]runtimespec.Mount{
|
||||
{
|
||||
Source: sandboxDevShm,
|
||||
@ -146,10 +166,7 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
|
||||
if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart {
|
||||
sysctls["net.ipv4.ip_unprivileged_port_start"] = "0"
|
||||
}
|
||||
// TODO (rata): We need to set this only if the pod will
|
||||
// **not** use user namespaces either.
|
||||
// This will be done when user namespaces is ported to sbserver.
|
||||
if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() {
|
||||
if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() && !usernsEnabled {
|
||||
sysctls["net.ipv4.ping_group_range"] = "0 2147483647"
|
||||
}
|
||||
}
|
||||
@ -326,3 +343,10 @@ func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxCo
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// sandboxSnapshotterOpts generates any platform specific snapshotter options
|
||||
// for a sandbox container.
|
||||
func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) {
|
||||
nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions()
|
||||
return snapshotterRemapOpts(nsOpts)
|
||||
}
|
||||
|
@ -106,6 +106,17 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf
|
||||
func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
testID := "test-id"
|
||||
nsPath := "test-cni"
|
||||
idMap := runtime.IDMapping{
|
||||
HostId: 1000,
|
||||
ContainerId: 1000,
|
||||
Length: 10,
|
||||
}
|
||||
expIDMap := runtimespec.LinuxIDMapping{
|
||||
HostID: 1000,
|
||||
ContainerID: 1000,
|
||||
Size: 10,
|
||||
}
|
||||
|
||||
for _, test := range []struct {
|
||||
desc string
|
||||
configChange func(*runtime.PodSandboxConfig)
|
||||
@ -134,6 +145,27 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "spec shouldn't have ping_group_range if userns are in use",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||
NamespaceOptions: &runtime.NamespaceOption{
|
||||
UsernsOptions: &runtime.UserNamespace{
|
||||
Mode: runtime.NamespaceMode_POD,
|
||||
Uids: []*runtime.IDMapping{&idMap},
|
||||
Gids: []*runtime.IDMapping{&idMap},
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
require.NotNil(t, spec.Linux)
|
||||
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
|
||||
Type: runtimespec.UserNamespace,
|
||||
})
|
||||
assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "host namespace",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
@ -164,6 +196,113 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "user namespace",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||
NamespaceOptions: &runtime.NamespaceOption{
|
||||
UsernsOptions: &runtime.UserNamespace{
|
||||
Mode: runtime.NamespaceMode_POD,
|
||||
Uids: []*runtime.IDMapping{&idMap},
|
||||
Gids: []*runtime.IDMapping{&idMap},
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
require.NotNil(t, spec.Linux)
|
||||
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
|
||||
Type: runtimespec.UserNamespace,
|
||||
})
|
||||
require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap})
|
||||
require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap})
|
||||
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "user namespace mode node and mappings",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||
NamespaceOptions: &runtime.NamespaceOption{
|
||||
UsernsOptions: &runtime.UserNamespace{
|
||||
Mode: runtime.NamespaceMode_NODE,
|
||||
Uids: []*runtime.IDMapping{&idMap},
|
||||
Gids: []*runtime.IDMapping{&idMap},
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
expectErr: true,
|
||||
},
|
||||
{
|
||||
desc: "user namespace with several mappings",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||
NamespaceOptions: &runtime.NamespaceOption{
|
||||
UsernsOptions: &runtime.UserNamespace{
|
||||
Mode: runtime.NamespaceMode_NODE,
|
||||
Uids: []*runtime.IDMapping{&idMap, &idMap},
|
||||
Gids: []*runtime.IDMapping{&idMap, &idMap},
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
expectErr: true,
|
||||
},
|
||||
{
|
||||
desc: "user namespace with uneven mappings",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||
NamespaceOptions: &runtime.NamespaceOption{
|
||||
UsernsOptions: &runtime.UserNamespace{
|
||||
Mode: runtime.NamespaceMode_NODE,
|
||||
Uids: []*runtime.IDMapping{&idMap, &idMap},
|
||||
Gids: []*runtime.IDMapping{&idMap},
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
expectErr: true,
|
||||
},
|
||||
{
|
||||
desc: "user namespace mode container",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||
NamespaceOptions: &runtime.NamespaceOption{
|
||||
UsernsOptions: &runtime.UserNamespace{
|
||||
Mode: runtime.NamespaceMode_CONTAINER,
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
expectErr: true,
|
||||
},
|
||||
{
|
||||
desc: "user namespace mode target",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||
NamespaceOptions: &runtime.NamespaceOption{
|
||||
UsernsOptions: &runtime.UserNamespace{
|
||||
Mode: runtime.NamespaceMode_TARGET,
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
expectErr: true,
|
||||
},
|
||||
{
|
||||
desc: "user namespace unknown mode",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||
NamespaceOptions: &runtime.NamespaceOption{
|
||||
UsernsOptions: &runtime.UserNamespace{
|
||||
Mode: runtime.NamespaceMode(100),
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
expectErr: true,
|
||||
},
|
||||
{
|
||||
desc: "should set supplemental groups correctly",
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
|
@ -21,6 +21,7 @@ package podsandbox
|
||||
import (
|
||||
"github.com/containerd/containerd/oci"
|
||||
"github.com/containerd/containerd/pkg/cri/annotations"
|
||||
"github.com/containerd/containerd/snapshots"
|
||||
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
@ -48,3 +49,9 @@ func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConf
|
||||
func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// sandboxSnapshotterOpts generates any platform specific snapshotter options
|
||||
// for a sandbox container.
|
||||
func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) {
|
||||
return []snapshots.Opt{}, nil
|
||||
}
|
||||
|
@ -27,6 +27,7 @@ import (
|
||||
|
||||
"github.com/containerd/containerd/pkg/cri/annotations"
|
||||
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
||||
"github.com/containerd/containerd/snapshots"
|
||||
)
|
||||
|
||||
func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
|
||||
@ -101,3 +102,8 @@ func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConf
|
||||
func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// No sandbox snapshotter options needed for windows.
|
||||
func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) {
|
||||
return []snapshots.Opt{}, nil
|
||||
}
|
||||
|
@ -23,6 +23,7 @@ import (
|
||||
"fmt"
|
||||
"math"
|
||||
"path/filepath"
|
||||
goruntime "runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -94,6 +95,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
|
||||
sandboxInfo.Runtime.Name = ociRuntime.Type
|
||||
|
||||
runtimeStart := time.Now()
|
||||
// Retrieve runtime options
|
||||
runtimeOpts, err := generateRuntimeOptions(ociRuntime)
|
||||
if err != nil {
|
||||
@ -142,8 +144,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
}
|
||||
}()
|
||||
|
||||
userNsEnabled := false
|
||||
if goruntime.GOOS != "windows" {
|
||||
usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
|
||||
if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD {
|
||||
userNsEnabled = true
|
||||
}
|
||||
}
|
||||
|
||||
// Setup the network namespace if host networking wasn't requested.
|
||||
if !hostNetwork(config) {
|
||||
if !hostNetwork(config) && !userNsEnabled {
|
||||
// XXX: This code is copied and pasted later for the podNetwork && userNsEnabled case too.
|
||||
// We can't move this to a function, as the defer calls need to be executed if other
|
||||
// errors are returned in this function. So, we would need more refactors to move
|
||||
// this code to a function and the idea was to not change the current code for
|
||||
// !userNsEnabled case, therefore doing it would defeat the purpose.
|
||||
//
|
||||
// The difference between the cases is the use of netns.NewNetNS() vs
|
||||
// netns.NewNetNSFromPID().
|
||||
//
|
||||
// To simplify this, in the future, we should just remove this case (podNetwork &&
|
||||
// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled).
|
||||
netStart := time.Now()
|
||||
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
||||
@ -222,8 +243,6 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
runtimeStart := time.Now()
|
||||
|
||||
if err := controller.Create(ctx, id, sb.WithOptions(config), sb.WithNetNSPath(sandbox.NetNSPath)); err != nil {
|
||||
return nil, fmt.Errorf("failed to create sandbox %q: %w", id, err)
|
||||
}
|
||||
@ -245,6 +264,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
if !hostNetwork(config) && userNsEnabled {
|
||||
// If userns is enabled, then the netns was created by the OCI runtime
|
||||
// on controller.Start(). The OCI runtime needs to create the netns
|
||||
// because, if userns is in use, the netns needs to be owned by the
|
||||
// userns. So, let the OCI runtime just handle this for us.
|
||||
// If the netns is not owned by the userns several problems will happen.
|
||||
// For instance, the container will lack permission (even if
|
||||
// capabilities are present) to modify the netns or, even worse, the OCI
|
||||
// runtime will fail to mount sysfs:
|
||||
// https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164
|
||||
//
|
||||
// Note we do this after controller.Start(), as before that we
|
||||
// can't get the PID for the sandbox that we need for the netns.
|
||||
// Doing a controller.Status() call before that fails (can't
|
||||
// find the sandbox) so we can't get the PID.
|
||||
netStart := time.Now()
|
||||
|
||||
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
||||
// namespaces. If the pod is in host network namespace then both are empty and should not
|
||||
// be used.
|
||||
var netnsMountDir = "/var/run/netns"
|
||||
if c.config.NetNSMountsUnderStateDir {
|
||||
netnsMountDir = filepath.Join(c.config.StateDir, "netns")
|
||||
}
|
||||
|
||||
sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
// Update network namespace in the store, which is used to generate the container's spec
|
||||
sandbox.NetNSPath = sandbox.NetNS.GetPath()
|
||||
defer func() {
|
||||
// Remove the network namespace only if all the resource cleanup is done
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
|
||||
return
|
||||
}
|
||||
sandbox.NetNSPath = ""
|
||||
}
|
||||
}()
|
||||
|
||||
if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
|
||||
return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
|
||||
}
|
||||
// Save sandbox metadata to store
|
||||
if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
|
||||
return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
|
||||
// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource
|
||||
// creation functions.
|
||||
defer func() {
|
||||
// Remove the network namespace only if all the resource cleanup is done.
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
deferCtx, deferCancel := util.DeferContext()
|
||||
defer deferCancel()
|
||||
// Teardown network if an error is returned.
|
||||
if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
|
||||
}
|
||||
|
||||
}
|
||||
}()
|
||||
|
||||
// Setup network for sandbox.
|
||||
// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
|
||||
// rely on the assumption that CRI shim will not be querying the network namespace to check the
|
||||
// network states such as IP.
|
||||
// In future runtime implementation should avoid relying on CRI shim implementation details.
|
||||
// In this case however caching the IP will add a subtle performance enhancement by avoiding
|
||||
// calls to network namespace of the pod to query the IP of the veth interface on every
|
||||
// SandboxStatus request.
|
||||
if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
|
||||
return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
|
||||
}
|
||||
sandboxCreateNetworkTimer.UpdateSince(netStart)
|
||||
}
|
||||
|
||||
// TODO: get rid of this. sandbox object should no longer have Container field.
|
||||
if ociRuntime.SandboxMode == string(criconfig.ModePodSandbox) {
|
||||
container, err := c.client.LoadContainer(ctx, id)
|
||||
|
@ -31,7 +31,7 @@ const (
|
||||
|
||||
// WithRemapperLabels creates the labels used by any supporting snapshotter
|
||||
// to shift the filesystem ownership (user namespace mapping) automatically; currently
|
||||
// supported by the fuse-overlayfs snapshotter
|
||||
// supported by the fuse-overlayfs and overlay snapshotters
|
||||
func WithRemapperLabels(ctrUID, hostUID, ctrGID, hostGID, length uint32) snapshots.Opt {
|
||||
return snapshots.WithLabels(map[string]string{
|
||||
snapshots.LabelSnapshotUIDMapping: fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length),
|
||||
|
Loading…
Reference in New Issue
Block a user