Merge pull request #8803 from kinvolk/rata/userns-sbserver
cri/sbserver: Add support for user namespaces (KEP-127)
This commit is contained in:
commit
a94918b591
@ -94,9 +94,6 @@ func TestPodUserNS(t *testing.T) {
|
|||||||
},
|
},
|
||||||
} {
|
} {
|
||||||
t.Run(name, func(t *testing.T) {
|
t.Run(name, func(t *testing.T) {
|
||||||
if os.Getenv("ENABLE_CRI_SANDBOXES") == "'sandboxed'" {
|
|
||||||
t.Skip("skipping test: userns not supported/needed in sanboxed runtimes")
|
|
||||||
}
|
|
||||||
cmd := exec.Command("true")
|
cmd := exec.Command("true")
|
||||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||||
Cloneflags: syscall.CLONE_NEWUSER,
|
Cloneflags: syscall.CLONE_NEWUSER,
|
||||||
|
@ -206,7 +206,10 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
|
|||||||
log.G(ctx).Debugf("Container %q spec: %#+v", id, spew.NewFormatter(spec))
|
log.G(ctx).Debugf("Container %q spec: %#+v", id, spew.NewFormatter(spec))
|
||||||
|
|
||||||
// Grab any platform specific snapshotter opts.
|
// Grab any platform specific snapshotter opts.
|
||||||
sOpts := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config)
|
sOpts, err := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
// Set snapshotter before any other options.
|
// Set snapshotter before any other options.
|
||||||
opts := []containerd.NewContainerOpts{
|
opts := []containerd.NewContainerOpts{
|
||||||
|
@ -264,6 +264,7 @@ func appArmorProfileExists(profile string) (bool, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// snapshotterOpts returns any Linux specific snapshotter options for the rootfs snapshot
|
// snapshotterOpts returns any Linux specific snapshotter options for the rootfs snapshot
|
||||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt {
|
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) {
|
||||||
return []snapshots.Opt{}
|
nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions()
|
||||||
|
return snapshotterRemapOpts(nsOpts)
|
||||||
}
|
}
|
||||||
|
@ -31,6 +31,6 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon
|
|||||||
}
|
}
|
||||||
|
|
||||||
// snapshotterOpts returns snapshotter options for the rootfs snapshot
|
// snapshotterOpts returns snapshotter options for the rootfs snapshot
|
||||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt {
|
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) {
|
||||||
return []snapshots.Opt{}
|
return []snapshots.Opt{}, nil
|
||||||
}
|
}
|
||||||
|
@ -32,7 +32,7 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon
|
|||||||
}
|
}
|
||||||
|
|
||||||
// snapshotterOpts returns any Windows specific snapshotter options for the r/w layer
|
// snapshotterOpts returns any Windows specific snapshotter options for the r/w layer
|
||||||
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt {
|
func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) {
|
||||||
var opts []snapshots.Opt
|
var opts []snapshots.Opt
|
||||||
|
|
||||||
switch snapshotterName {
|
switch snapshotterName {
|
||||||
@ -47,5 +47,5 @@ func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return opts
|
return opts, nil
|
||||||
}
|
}
|
||||||
|
@ -30,12 +30,15 @@ import (
|
|||||||
"github.com/moby/sys/mountinfo"
|
"github.com/moby/sys/mountinfo"
|
||||||
"github.com/opencontainers/runtime-spec/specs-go"
|
"github.com/opencontainers/runtime-spec/specs-go"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd"
|
||||||
"github.com/containerd/containerd/log"
|
"github.com/containerd/containerd/log"
|
||||||
"github.com/containerd/containerd/mount"
|
"github.com/containerd/containerd/mount"
|
||||||
"github.com/containerd/containerd/pkg/apparmor"
|
"github.com/containerd/containerd/pkg/apparmor"
|
||||||
"github.com/containerd/containerd/pkg/seccomp"
|
"github.com/containerd/containerd/pkg/seccomp"
|
||||||
"github.com/containerd/containerd/pkg/seutil"
|
"github.com/containerd/containerd/pkg/seutil"
|
||||||
|
"github.com/containerd/containerd/snapshots"
|
||||||
)
|
)
|
||||||
|
|
||||||
// apparmorEnabled returns true if apparmor is enabled, supported by the host,
|
// apparmorEnabled returns true if apparmor is enabled, supported by the host,
|
||||||
@ -181,3 +184,21 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
|||||||
func isUnifiedCgroupsMode() bool {
|
func isUnifiedCgroupsMode() bool {
|
||||||
return cgroups.Mode() == cgroups.Unified
|
return cgroups.Mode() == cgroups.Unified
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) {
|
||||||
|
snapshotOpt := []snapshots.Opt{}
|
||||||
|
usernsOpts := nsOpts.GetUsernsOptions()
|
||||||
|
if usernsOpts == nil {
|
||||||
|
return snapshotOpt, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
uids, gids, err := parseUsernsIDs(usernsOpts)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("user namespace configuration: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if usernsOpts.GetMode() == runtime.NamespaceMode_POD {
|
||||||
|
snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size))
|
||||||
|
}
|
||||||
|
return snapshotOpt, nil
|
||||||
|
}
|
||||||
|
@ -28,12 +28,15 @@ import (
|
|||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd"
|
||||||
"github.com/containerd/containerd/log"
|
"github.com/containerd/containerd/log"
|
||||||
"github.com/containerd/containerd/mount"
|
"github.com/containerd/containerd/mount"
|
||||||
"github.com/containerd/containerd/pkg/seccomp"
|
"github.com/containerd/containerd/pkg/seccomp"
|
||||||
"github.com/containerd/containerd/pkg/seutil"
|
"github.com/containerd/containerd/pkg/seutil"
|
||||||
|
"github.com/containerd/containerd/snapshots"
|
||||||
|
|
||||||
"github.com/moby/sys/mountinfo"
|
"github.com/moby/sys/mountinfo"
|
||||||
"github.com/opencontainers/runtime-spec/specs-go"
|
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||||
"github.com/opencontainers/selinux/go-selinux/label"
|
"github.com/opencontainers/selinux/go-selinux/label"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
@ -242,7 +245,7 @@ func isVMBasedRuntime(runtimeType string) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
func modifyProcessLabel(runtimeType string, spec *runtimespec.Spec) error {
|
||||||
if !isVMBasedRuntime(runtimeType) {
|
if !isVMBasedRuntime(runtimeType) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -253,3 +256,89 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
|
|||||||
spec.Process.SelinuxLabel = l
|
spec.Process.SelinuxLabel = l
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) {
|
||||||
|
var m []runtimespec.LinuxIDMapping
|
||||||
|
|
||||||
|
if len(runtimeIDMap) == 0 {
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(runtimeIDMap) > 1 {
|
||||||
|
// We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
|
||||||
|
return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap))
|
||||||
|
}
|
||||||
|
|
||||||
|
// We know len is 1 now.
|
||||||
|
if runtimeIDMap[0] == nil {
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
uidMap := *runtimeIDMap[0]
|
||||||
|
|
||||||
|
if uidMap.Length < 1 {
|
||||||
|
return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length)
|
||||||
|
}
|
||||||
|
|
||||||
|
m = []runtimespec.LinuxIDMapping{
|
||||||
|
{
|
||||||
|
ContainerID: uidMap.ContainerId,
|
||||||
|
HostID: uidMap.HostId,
|
||||||
|
Size: uidMap.Length,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) {
|
||||||
|
if userns == nil {
|
||||||
|
// If userns is not set, the kubelet doesn't support this option
|
||||||
|
// and we should just fallback to no userns. This is completely
|
||||||
|
// valid.
|
||||||
|
return nil, nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
uids, err := parseUsernsIDMap(userns.GetUids())
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("UID mapping: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
gids, err = parseUsernsIDMap(userns.GetGids())
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("GID mapping: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
switch mode := userns.GetMode(); mode {
|
||||||
|
case runtime.NamespaceMode_NODE:
|
||||||
|
if len(uids) != 0 || len(gids) != 0 {
|
||||||
|
return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids))
|
||||||
|
}
|
||||||
|
case runtime.NamespaceMode_POD:
|
||||||
|
// This is valid, we will handle it in WithPodNamespaces().
|
||||||
|
if len(uids) == 0 || len(gids) == 0 {
|
||||||
|
return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
|
||||||
|
}
|
||||||
|
|
||||||
|
return uids, gids, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) {
|
||||||
|
snapshotOpt := []snapshots.Opt{}
|
||||||
|
usernsOpts := nsOpts.GetUsernsOptions()
|
||||||
|
if usernsOpts == nil {
|
||||||
|
return snapshotOpt, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
uids, gids, err := parseUsernsIDs(usernsOpts)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("user namespace configuration: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if usernsOpts.GetMode() == runtime.NamespaceMode_POD {
|
||||||
|
snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size))
|
||||||
|
}
|
||||||
|
return snapshotOpt, nil
|
||||||
|
}
|
||||||
|
@ -136,10 +136,16 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll
|
|||||||
|
|
||||||
sandboxLabels := buildLabels(config.Labels, image.ImageSpec.Config.Labels, containerKindSandbox)
|
sandboxLabels := buildLabels(config.Labels, image.ImageSpec.Config.Labels, containerKindSandbox)
|
||||||
|
|
||||||
snapshotterOpt := snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations))
|
snapshotterOpt := []snapshots.Opt{snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations))}
|
||||||
|
extraSOpts, err := sandboxSnapshotterOpts(config)
|
||||||
|
if err != nil {
|
||||||
|
return cin, err
|
||||||
|
}
|
||||||
|
snapshotterOpt = append(snapshotterOpt, extraSOpts...)
|
||||||
|
|
||||||
opts := []containerd.NewContainerOpts{
|
opts := []containerd.NewContainerOpts{
|
||||||
containerd.WithSnapshotter(c.runtimeSnapshotter(ctx, ociRuntime)),
|
containerd.WithSnapshotter(c.runtimeSnapshotter(ctx, ociRuntime)),
|
||||||
customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt),
|
customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt...),
|
||||||
containerd.WithSpec(spec, specOpts...),
|
containerd.WithSpec(spec, specOpts...),
|
||||||
containerd.WithContainerLabels(sandboxLabels),
|
containerd.WithContainerLabels(sandboxLabels),
|
||||||
containerd.WithContainerExtension(sandboxMetadataExtension, &metadata),
|
containerd.WithContainerExtension(sandboxMetadataExtension, &metadata),
|
||||||
|
@ -32,6 +32,7 @@ import (
|
|||||||
"github.com/containerd/containerd/pkg/cri/annotations"
|
"github.com/containerd/containerd/pkg/cri/annotations"
|
||||||
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
||||||
"github.com/containerd/containerd/pkg/userns"
|
"github.com/containerd/containerd/pkg/userns"
|
||||||
|
"github.com/containerd/containerd/snapshots"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
|
func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
|
||||||
@ -92,6 +93,25 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
|
|||||||
specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace))
|
specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
usernsOpts := nsOptions.GetUsernsOptions()
|
||||||
|
uids, gids, err := parseUsernsIDs(usernsOpts)
|
||||||
|
var usernsEnabled bool
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("user namespace configuration: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if usernsOpts != nil {
|
||||||
|
switch mode := usernsOpts.GetMode(); mode {
|
||||||
|
case runtime.NamespaceMode_NODE:
|
||||||
|
specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UserNamespace))
|
||||||
|
case runtime.NamespaceMode_POD:
|
||||||
|
specOpts = append(specOpts, oci.WithUserNamespace(uids, gids))
|
||||||
|
usernsEnabled = true
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// It's fine to generate the spec before the sandbox /dev/shm
|
// It's fine to generate the spec before the sandbox /dev/shm
|
||||||
// is actually created.
|
// is actually created.
|
||||||
sandboxDevShm := c.getSandboxDevShm(id)
|
sandboxDevShm := c.getSandboxDevShm(id)
|
||||||
@ -100,9 +120,9 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
|
|||||||
}
|
}
|
||||||
// Remove the default /dev/shm mount from defaultMounts, it is added in oci/mounts.go.
|
// Remove the default /dev/shm mount from defaultMounts, it is added in oci/mounts.go.
|
||||||
specOpts = append(specOpts, oci.WithoutMounts(devShm))
|
specOpts = append(specOpts, oci.WithoutMounts(devShm))
|
||||||
// In future the when user-namespace is enabled, the `nosuid, nodev, noexec` flags are
|
// When user-namespace is enabled, the `nosuid, nodev, noexec` flags are
|
||||||
// required, otherwise the remount will fail with EPERM. Just use them unconditionally,
|
// required, otherwise the remount will fail with EPERM. Just use them
|
||||||
// they are nice to have anyways.
|
// unconditionally, they are nice to have anyways.
|
||||||
specOpts = append(specOpts, oci.WithMounts([]runtimespec.Mount{
|
specOpts = append(specOpts, oci.WithMounts([]runtimespec.Mount{
|
||||||
{
|
{
|
||||||
Source: sandboxDevShm,
|
Source: sandboxDevShm,
|
||||||
@ -146,10 +166,7 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
|
|||||||
if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart {
|
if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart {
|
||||||
sysctls["net.ipv4.ip_unprivileged_port_start"] = "0"
|
sysctls["net.ipv4.ip_unprivileged_port_start"] = "0"
|
||||||
}
|
}
|
||||||
// TODO (rata): We need to set this only if the pod will
|
if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() && !usernsEnabled {
|
||||||
// **not** use user namespaces either.
|
|
||||||
// This will be done when user namespaces is ported to sbserver.
|
|
||||||
if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() {
|
|
||||||
sysctls["net.ipv4.ping_group_range"] = "0 2147483647"
|
sysctls["net.ipv4.ping_group_range"] = "0 2147483647"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -326,3 +343,10 @@ func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxCo
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sandboxSnapshotterOpts generates any platform specific snapshotter options
|
||||||
|
// for a sandbox container.
|
||||||
|
func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) {
|
||||||
|
nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions()
|
||||||
|
return snapshotterRemapOpts(nsOpts)
|
||||||
|
}
|
||||||
|
@ -106,6 +106,17 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf
|
|||||||
func TestLinuxSandboxContainerSpec(t *testing.T) {
|
func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||||
testID := "test-id"
|
testID := "test-id"
|
||||||
nsPath := "test-cni"
|
nsPath := "test-cni"
|
||||||
|
idMap := runtime.IDMapping{
|
||||||
|
HostId: 1000,
|
||||||
|
ContainerId: 1000,
|
||||||
|
Length: 10,
|
||||||
|
}
|
||||||
|
expIDMap := runtimespec.LinuxIDMapping{
|
||||||
|
HostID: 1000,
|
||||||
|
ContainerID: 1000,
|
||||||
|
Size: 10,
|
||||||
|
}
|
||||||
|
|
||||||
for _, test := range []struct {
|
for _, test := range []struct {
|
||||||
desc string
|
desc string
|
||||||
configChange func(*runtime.PodSandboxConfig)
|
configChange func(*runtime.PodSandboxConfig)
|
||||||
@ -134,6 +145,27 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
|||||||
assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
desc: "spec shouldn't have ping_group_range if userns are in use",
|
||||||
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||||
|
NamespaceOptions: &runtime.NamespaceOption{
|
||||||
|
UsernsOptions: &runtime.UserNamespace{
|
||||||
|
Mode: runtime.NamespaceMode_POD,
|
||||||
|
Uids: []*runtime.IDMapping{&idMap},
|
||||||
|
Gids: []*runtime.IDMapping{&idMap},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||||
|
require.NotNil(t, spec.Linux)
|
||||||
|
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
|
||||||
|
Type: runtimespec.UserNamespace,
|
||||||
|
})
|
||||||
|
assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
desc: "host namespace",
|
desc: "host namespace",
|
||||||
configChange: func(c *runtime.PodSandboxConfig) {
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
@ -164,6 +196,113 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
|||||||
assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
desc: "user namespace",
|
||||||
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||||
|
NamespaceOptions: &runtime.NamespaceOption{
|
||||||
|
UsernsOptions: &runtime.UserNamespace{
|
||||||
|
Mode: runtime.NamespaceMode_POD,
|
||||||
|
Uids: []*runtime.IDMapping{&idMap},
|
||||||
|
Gids: []*runtime.IDMapping{&idMap},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||||
|
require.NotNil(t, spec.Linux)
|
||||||
|
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
|
||||||
|
Type: runtimespec.UserNamespace,
|
||||||
|
})
|
||||||
|
require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap})
|
||||||
|
require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap})
|
||||||
|
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "user namespace mode node and mappings",
|
||||||
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||||
|
NamespaceOptions: &runtime.NamespaceOption{
|
||||||
|
UsernsOptions: &runtime.UserNamespace{
|
||||||
|
Mode: runtime.NamespaceMode_NODE,
|
||||||
|
Uids: []*runtime.IDMapping{&idMap},
|
||||||
|
Gids: []*runtime.IDMapping{&idMap},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
expectErr: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "user namespace with several mappings",
|
||||||
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||||
|
NamespaceOptions: &runtime.NamespaceOption{
|
||||||
|
UsernsOptions: &runtime.UserNamespace{
|
||||||
|
Mode: runtime.NamespaceMode_NODE,
|
||||||
|
Uids: []*runtime.IDMapping{&idMap, &idMap},
|
||||||
|
Gids: []*runtime.IDMapping{&idMap, &idMap},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
expectErr: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "user namespace with uneven mappings",
|
||||||
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||||
|
NamespaceOptions: &runtime.NamespaceOption{
|
||||||
|
UsernsOptions: &runtime.UserNamespace{
|
||||||
|
Mode: runtime.NamespaceMode_NODE,
|
||||||
|
Uids: []*runtime.IDMapping{&idMap, &idMap},
|
||||||
|
Gids: []*runtime.IDMapping{&idMap},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
expectErr: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "user namespace mode container",
|
||||||
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||||
|
NamespaceOptions: &runtime.NamespaceOption{
|
||||||
|
UsernsOptions: &runtime.UserNamespace{
|
||||||
|
Mode: runtime.NamespaceMode_CONTAINER,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
expectErr: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "user namespace mode target",
|
||||||
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||||
|
NamespaceOptions: &runtime.NamespaceOption{
|
||||||
|
UsernsOptions: &runtime.UserNamespace{
|
||||||
|
Mode: runtime.NamespaceMode_TARGET,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
expectErr: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
desc: "user namespace unknown mode",
|
||||||
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{
|
||||||
|
NamespaceOptions: &runtime.NamespaceOption{
|
||||||
|
UsernsOptions: &runtime.UserNamespace{
|
||||||
|
Mode: runtime.NamespaceMode(100),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
expectErr: true,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
desc: "should set supplemental groups correctly",
|
desc: "should set supplemental groups correctly",
|
||||||
configChange: func(c *runtime.PodSandboxConfig) {
|
configChange: func(c *runtime.PodSandboxConfig) {
|
||||||
|
@ -21,6 +21,7 @@ package podsandbox
|
|||||||
import (
|
import (
|
||||||
"github.com/containerd/containerd/oci"
|
"github.com/containerd/containerd/oci"
|
||||||
"github.com/containerd/containerd/pkg/cri/annotations"
|
"github.com/containerd/containerd/pkg/cri/annotations"
|
||||||
|
"github.com/containerd/containerd/snapshots"
|
||||||
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||||
@ -48,3 +49,9 @@ func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConf
|
|||||||
func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
|
func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sandboxSnapshotterOpts generates any platform specific snapshotter options
|
||||||
|
// for a sandbox container.
|
||||||
|
func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) {
|
||||||
|
return []snapshots.Opt{}, nil
|
||||||
|
}
|
||||||
|
@ -27,6 +27,7 @@ import (
|
|||||||
|
|
||||||
"github.com/containerd/containerd/pkg/cri/annotations"
|
"github.com/containerd/containerd/pkg/cri/annotations"
|
||||||
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
customopts "github.com/containerd/containerd/pkg/cri/opts"
|
||||||
|
"github.com/containerd/containerd/snapshots"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
|
func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
|
||||||
@ -101,3 +102,8 @@ func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConf
|
|||||||
func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
|
func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// No sandbox snapshotter options needed for windows.
|
||||||
|
func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) {
|
||||||
|
return []snapshots.Opt{}, nil
|
||||||
|
}
|
||||||
|
@ -23,6 +23,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
goruntime "runtime"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -94,6 +95,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
|||||||
|
|
||||||
sandboxInfo.Runtime.Name = ociRuntime.Type
|
sandboxInfo.Runtime.Name = ociRuntime.Type
|
||||||
|
|
||||||
|
runtimeStart := time.Now()
|
||||||
// Retrieve runtime options
|
// Retrieve runtime options
|
||||||
runtimeOpts, err := generateRuntimeOptions(ociRuntime)
|
runtimeOpts, err := generateRuntimeOptions(ociRuntime)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -142,8 +144,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
userNsEnabled := false
|
||||||
|
if goruntime.GOOS != "windows" {
|
||||||
|
usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
|
||||||
|
if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD {
|
||||||
|
userNsEnabled = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Setup the network namespace if host networking wasn't requested.
|
// Setup the network namespace if host networking wasn't requested.
|
||||||
if !hostNetwork(config) {
|
if !hostNetwork(config) && !userNsEnabled {
|
||||||
|
// XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too.
|
||||||
|
// We can't move this to a function, as the defer calls need to be executed if other
|
||||||
|
// errors are returned in this function. So, we would need more refactors to move
|
||||||
|
// this code to a function and the idea was to not change the current code for
|
||||||
|
// !userNsEnabled case, therefore doing it would defeat the purpose.
|
||||||
|
//
|
||||||
|
// The difference between the cases is the use of netns.NewNetNS() vs
|
||||||
|
// netns.NewNetNSFromPID().
|
||||||
|
//
|
||||||
|
// To simplify this, in the future, we should just remove this case (podNetwork &&
|
||||||
|
// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled).
|
||||||
netStart := time.Now()
|
netStart := time.Now()
|
||||||
// If it is not in host network namespace then create a namespace and set the sandbox
|
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||||
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
||||||
@ -222,8 +243,6 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
|||||||
return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
|
return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
runtimeStart := time.Now()
|
|
||||||
|
|
||||||
if err := controller.Create(ctx, id, sb.WithOptions(config), sb.WithNetNSPath(sandbox.NetNSPath)); err != nil {
|
if err := controller.Create(ctx, id, sb.WithOptions(config), sb.WithNetNSPath(sandbox.NetNSPath)); err != nil {
|
||||||
return nil, fmt.Errorf("failed to create sandbox %q: %w", id, err)
|
return nil, fmt.Errorf("failed to create sandbox %q: %w", id, err)
|
||||||
}
|
}
|
||||||
@ -245,6 +264,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
|||||||
return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err)
|
return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if !hostNetwork(config) && userNsEnabled {
|
||||||
|
// If userns is enabled, then the netns was created by the OCI runtime
|
||||||
|
// on controller.Start(). The OCI runtime needs to create the netns
|
||||||
|
// because, if userns is in use, the netns needs to be owned by the
|
||||||
|
// userns. So, let the OCI runtime just handle this for us.
|
||||||
|
// If the netns is not owned by the userns several problems will happen.
|
||||||
|
// For instance, the container will lack permission (even if
|
||||||
|
// capabilities are present) to modify the netns or, even worse, the OCI
|
||||||
|
// runtime will fail to mount sysfs:
|
||||||
|
// https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164
|
||||||
|
//
|
||||||
|
// Note we do this after controller.Start(), as before that we
|
||||||
|
// can't get the PID for the sandbox that we need for the netns.
|
||||||
|
// Doing a controller.Status() call before that fails (can't
|
||||||
|
// find the sandbox) so we can't get the PID.
|
||||||
|
netStart := time.Now()
|
||||||
|
|
||||||
|
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||||
|
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
||||||
|
// namespaces. If the pod is in host network namespace then both are empty and should not
|
||||||
|
// be used.
|
||||||
|
var netnsMountDir = "/var/run/netns"
|
||||||
|
if c.config.NetNSMountsUnderStateDir {
|
||||||
|
netnsMountDir = filepath.Join(c.config.StateDir, "netns")
|
||||||
|
}
|
||||||
|
|
||||||
|
sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update network namespace in the store, which is used to generate the container's spec
|
||||||
|
sandbox.NetNSPath = sandbox.NetNS.GetPath()
|
||||||
|
defer func() {
|
||||||
|
// Remove the network namespace only if all the resource cleanup is done
|
||||||
|
if retErr != nil && cleanupErr == nil {
|
||||||
|
if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
|
||||||
|
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
sandbox.NetNSPath = ""
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
|
||||||
|
}
|
||||||
|
// Save sandbox metadata to store
|
||||||
|
if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
|
||||||
|
// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource
|
||||||
|
// creation functions.
|
||||||
|
defer func() {
|
||||||
|
// Teardown the pod network only if all the earlier resource cleanup succeeded.
|
||||||
|
if retErr != nil && cleanupErr == nil {
|
||||||
|
deferCtx, deferCancel := util.DeferContext()
|
||||||
|
defer deferCancel()
|
||||||
|
// Teardown network if an error is returned.
|
||||||
|
if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
|
||||||
|
log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Setup network for sandbox.
|
||||||
|
// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
|
||||||
|
// rely on the assumption that CRI shim will not be querying the network namespace to check the
|
||||||
|
// network states such as IP.
|
||||||
|
// In the future, runtime implementations should avoid relying on CRI shim implementation details.
|
||||||
|
// In this case however caching the IP will add a subtle performance enhancement by avoiding
|
||||||
|
// calls to network namespace of the pod to query the IP of the veth interface on every
|
||||||
|
// SandboxStatus request.
|
||||||
|
if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
|
||||||
|
}
|
||||||
|
sandboxCreateNetworkTimer.UpdateSince(netStart)
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: get rid of this. sandbox object should no longer have Container field.
|
// TODO: get rid of this. sandbox object should no longer have Container field.
|
||||||
if ociRuntime.SandboxMode == string(criconfig.ModePodSandbox) {
|
if ociRuntime.SandboxMode == string(criconfig.ModePodSandbox) {
|
||||||
container, err := c.client.LoadContainer(ctx, id)
|
container, err := c.client.LoadContainer(ctx, id)
|
||||||
|
@ -31,7 +31,7 @@ const (
|
|||||||
|
|
||||||
// WithRemapperLabels creates the labels used by any supporting snapshotter
|
// WithRemapperLabels creates the labels used by any supporting snapshotter
|
||||||
// to shift the filesystem ownership (user namespace mapping) automatically; currently
|
// to shift the filesystem ownership (user namespace mapping) automatically; currently
|
||||||
// supported by the fuse-overlayfs snapshotter
|
// supported by the fuse-overlayfs and overlay snapshotters
|
||||||
func WithRemapperLabels(ctrUID, hostUID, ctrGID, hostGID, length uint32) snapshots.Opt {
|
func WithRemapperLabels(ctrUID, hostUID, ctrGID, hostGID, length uint32) snapshots.Opt {
|
||||||
return snapshots.WithLabels(map[string]string{
|
return snapshots.WithLabels(map[string]string{
|
||||||
snapshots.LabelSnapshotUIDMapping: fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length),
|
snapshots.LabelSnapshotUIDMapping: fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length),
|
||||||
|
Loading…
Reference in New Issue
Block a user