Merge pull request #10607 from fuweid/pin-userns
internal/cri: simplify netns setup with pinned userns
This commit is contained in:
commit
8c64a2f6a1
@ -301,6 +301,23 @@ func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts {
|
||||
}
|
||||
}
|
||||
|
||||
// WithNamespacePath updates namespace with existing path.
|
||||
func WithNamespacePath(t runtimespec.LinuxNamespaceType, nsPath string) oci.SpecOpts {
|
||||
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
|
||||
if s.Linux == nil {
|
||||
return fmt.Errorf("Linux spec is required")
|
||||
}
|
||||
|
||||
for i, ns := range s.Linux.Namespaces {
|
||||
if ns.Type == t {
|
||||
s.Linux.Namespaces[i].Path = nsPath
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("no such namespace %s", t)
|
||||
}
|
||||
}
|
||||
|
||||
// WithPodNamespaces sets the pod namespaces for the container
|
||||
func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts {
|
||||
namespaces := config.GetNamespaceOptions()
|
||||
|
@ -40,6 +40,7 @@ import (
|
||||
"github.com/containerd/containerd/v2/core/snapshots"
|
||||
"github.com/containerd/containerd/v2/internal/cri/seutil"
|
||||
"github.com/containerd/containerd/v2/pkg/seccomp"
|
||||
"github.com/containerd/containerd/v2/pkg/sys"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -88,6 +89,50 @@ func (c *Controller) getSandboxDevShm(id string) string {
|
||||
return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
|
||||
}
|
||||
|
||||
// getSandboxPinnedNamespaces returns the pinned namespaces directory inside the
|
||||
// sandbox state directory.
|
||||
func (c *Controller) getSandboxPinnedNamespaces(id string) string {
|
||||
return filepath.Join(c.getVolatileSandboxRootDir(id), "pinned-namespaces")
|
||||
}
|
||||
|
||||
// getSandboxPinnedUserNamespace returns the pinned user namespace file.
|
||||
func (c *Controller) getSandboxPinnedUserNamespace(id string) string {
|
||||
return filepath.Join(c.getSandboxPinnedNamespaces(id), "user")
|
||||
}
|
||||
|
||||
// pinUserNamespace persists user namespace in namespace filesystem.
|
||||
func (c *Controller) pinUserNamespace(sandboxID string, netnsPath string) error {
|
||||
nsPath := c.getSandboxPinnedUserNamespace(sandboxID)
|
||||
|
||||
baseDir := filepath.Dir(nsPath)
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to init pinned-namespaces directory %s: %w", baseDir, err)
|
||||
}
|
||||
|
||||
emptyFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create empty file %s: %w", nsPath, err)
|
||||
}
|
||||
emptyFd.Close()
|
||||
|
||||
netnsFd, err := os.Open(netnsPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open netns(%s): %w", netnsPath, err)
|
||||
}
|
||||
defer netnsFd.Close()
|
||||
|
||||
usernsFd, err := sys.GetUsernsForNamespace(netnsFd.Fd())
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get user namespace for netns(%s): %w", netnsPath, err)
|
||||
}
|
||||
defer usernsFd.Close()
|
||||
|
||||
if err = unix.Mount(usernsFd.Name(), nsPath, "none", unix.MS_BIND, ""); err != nil {
|
||||
return fmt.Errorf("failed to bind mount ns src: %v at %s: %w", usernsFd.Name(), nsPath, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
|
||||
var labels []string
|
||||
|
||||
|
@ -95,6 +95,39 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll
|
||||
|
||||
labels["oci_runtime_type"] = ociRuntime.Type
|
||||
|
||||
// Create sandbox container root directories.
|
||||
sandboxRootDir := c.getSandboxRootDir(id)
|
||||
if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil {
|
||||
return cin, fmt.Errorf("failed to create sandbox root directory %q: %w",
|
||||
sandboxRootDir, err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
// Cleanup the sandbox root directory.
|
||||
if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q",
|
||||
sandboxRootDir)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
volatileSandboxRootDir := c.getVolatileSandboxRootDir(id)
|
||||
if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil {
|
||||
return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w",
|
||||
volatileSandboxRootDir, err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
deferCtx, deferCancel := ctrdutil.DeferContext()
|
||||
defer deferCancel()
|
||||
// Cleanup the volatile sandbox root directory.
|
||||
if cleanupErr = ensureRemoveAll(deferCtx, volatileSandboxRootDir); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q",
|
||||
volatileSandboxRootDir)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Create sandbox container.
|
||||
// NOTE: sandboxContainerSpec SHOULD NOT have side
|
||||
// effect, e.g. accessing/creating files, so that we can test
|
||||
@ -164,37 +197,6 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll
|
||||
}
|
||||
}()
|
||||
|
||||
// Create sandbox container root directories.
|
||||
sandboxRootDir := c.getSandboxRootDir(id)
|
||||
if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil {
|
||||
return cin, fmt.Errorf("failed to create sandbox root directory %q: %w",
|
||||
sandboxRootDir, err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
// Cleanup the sandbox root directory.
|
||||
if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q",
|
||||
sandboxRootDir)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
volatileSandboxRootDir := c.getVolatileSandboxRootDir(id)
|
||||
if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil {
|
||||
return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w",
|
||||
volatileSandboxRootDir, err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
// Cleanup the volatile sandbox root directory.
|
||||
if cleanupErr = c.os.RemoveAll(volatileSandboxRootDir); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q",
|
||||
volatileSandboxRootDir)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Setup files required for the sandbox.
|
||||
if err = c.setupSandboxFiles(id, config); err != nil {
|
||||
return cin, fmt.Errorf("failed to setup sandbox files: %w", err)
|
||||
|
@ -103,6 +103,11 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
|
||||
case runtime.NamespaceMode_POD:
|
||||
specOpts = append(specOpts, oci.WithUserNamespace(uids, gids))
|
||||
usernsEnabled = true
|
||||
|
||||
if err := c.pinUserNamespace(id, nsPath); err != nil {
|
||||
return nil, fmt.Errorf("failed to pin user namespace: %w", err)
|
||||
}
|
||||
specOpts = append(specOpts, customopts.WithNamespacePath(runtimespec.UserNamespace, c.getSandboxPinnedUserNamespace(id)))
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
|
||||
}
|
||||
|
@ -17,9 +17,11 @@
|
||||
package podsandbox
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
"github.com/moby/sys/userns"
|
||||
@ -32,11 +34,15 @@ import (
|
||||
v1 "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
|
||||
"github.com/containerd/containerd/v2/internal/cri/annotations"
|
||||
criconfig "github.com/containerd/containerd/v2/internal/cri/config"
|
||||
"github.com/containerd/containerd/v2/internal/cri/opts"
|
||||
"github.com/containerd/containerd/v2/pkg/netns"
|
||||
ostesting "github.com/containerd/containerd/v2/pkg/os/testing"
|
||||
"github.com/containerd/containerd/v2/pkg/sys"
|
||||
"github.com/containerd/containerd/v2/pkg/testutil"
|
||||
)
|
||||
|
||||
func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
|
||||
func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
|
||||
config := &runtime.PodSandboxConfig{
|
||||
Metadata: &runtime.PodSandboxMetadata{
|
||||
Name: "test-name",
|
||||
@ -94,7 +100,7 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf
|
||||
}
|
||||
|
||||
assert.Contains(t, spec.Mounts, runtimespec.Mount{
|
||||
Source: "/test/root/sandboxes/test-id/resolv.conf",
|
||||
Source: filepath.Join(criCfg.RootDir, "sandboxes/test-id/resolv.conf"),
|
||||
Destination: resolvConfPath,
|
||||
Type: "bind",
|
||||
Options: []string{"rbind", "ro", "nosuid", "nodev", "noexec"},
|
||||
@ -105,8 +111,10 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf
|
||||
}
|
||||
|
||||
func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
testutil.RequiresRoot(t)
|
||||
|
||||
testID := "test-id"
|
||||
nsPath := "test-cni"
|
||||
|
||||
idMap := runtime.IDMapping{
|
||||
HostId: 1000,
|
||||
ContainerId: 1000,
|
||||
@ -118,15 +126,30 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
Size: 10,
|
||||
}
|
||||
|
||||
netnsBasedir := t.TempDir()
|
||||
t.Cleanup(func() {
|
||||
assert.NoError(t, unmountRecursive(context.Background(), netnsBasedir))
|
||||
})
|
||||
|
||||
var netNs *netns.NetNS
|
||||
uerr := sys.UnshareAfterEnterUserns("1000:1000:10", "1000:1000:10", syscall.CLONE_NEWNET, func(pid int) error {
|
||||
var err error
|
||||
netNs, err = netns.NewNetNSFromPID(netnsBasedir, uint32(pid))
|
||||
return err
|
||||
})
|
||||
require.NoError(t, uerr)
|
||||
|
||||
nsPath := netNs.GetPath()
|
||||
|
||||
for _, test := range []struct {
|
||||
desc string
|
||||
configChange func(*runtime.PodSandboxConfig)
|
||||
specCheck func(*testing.T, *runtimespec.Spec)
|
||||
specCheck func(*testing.T, *Controller, *runtimespec.Spec)
|
||||
expectErr bool
|
||||
}{
|
||||
{
|
||||
desc: "spec should reflect original config",
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
|
||||
// runtime spec should have expected namespaces enabled by default.
|
||||
require.NotNil(t, spec.Linux)
|
||||
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
|
||||
@ -162,10 +185,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
},
|
||||
}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) {
|
||||
require.NotNil(t, spec.Linux)
|
||||
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
|
||||
Type: runtimespec.UserNamespace,
|
||||
Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"),
|
||||
})
|
||||
assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
|
||||
},
|
||||
@ -181,7 +205,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
},
|
||||
}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
|
||||
// runtime spec should disable expected namespaces in host mode.
|
||||
require.NotNil(t, spec.Linux)
|
||||
assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
|
||||
@ -213,10 +237,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
},
|
||||
}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) {
|
||||
require.NotNil(t, spec.Linux)
|
||||
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
|
||||
Type: runtimespec.UserNamespace,
|
||||
Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"),
|
||||
})
|
||||
require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap})
|
||||
require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap})
|
||||
@ -314,7 +339,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
SupplementalGroups: []int64{1111, 2222},
|
||||
}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
|
||||
require.NotNil(t, spec.Process)
|
||||
assert.Contains(t, spec.Process.User.AdditionalGids, uint32(1111))
|
||||
assert.Contains(t, spec.Process.User.AdditionalGids, uint32(2222))
|
||||
@ -328,7 +353,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
"net.ipv4.ping_group_range": "1 1000",
|
||||
}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
|
||||
require.NotNil(t, spec.Process)
|
||||
assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "500")
|
||||
assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "1 1000")
|
||||
@ -344,7 +369,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
MemoryLimitInBytes: 1024,
|
||||
}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
|
||||
value, ok := spec.Annotations[annotations.SandboxCPUPeriod]
|
||||
assert.True(t, ok)
|
||||
assert.EqualValues(t, strconv.FormatInt(100, 10), value)
|
||||
@ -365,7 +390,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
},
|
||||
{
|
||||
desc: "sandbox sizing annotations should not be set if LinuxContainerResources were not provided",
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
|
||||
_, ok := spec.Annotations[annotations.SandboxCPUPeriod]
|
||||
assert.False(t, ok)
|
||||
_, ok = spec.Annotations[annotations.SandboxCPUQuota]
|
||||
@ -381,7 +406,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
configChange: func(c *runtime.PodSandboxConfig) {
|
||||
c.Linux.Resources = &v1.LinuxContainerResources{}
|
||||
},
|
||||
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
|
||||
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
|
||||
value, ok := spec.Annotations[annotations.SandboxCPUPeriod]
|
||||
assert.True(t, ok)
|
||||
assert.EqualValues(t, "0", value)
|
||||
@ -400,9 +425,17 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
test := test
|
||||
t.Run(test.desc, func(t *testing.T) {
|
||||
c := newControllerService()
|
||||
c.config.RootDir = t.TempDir()
|
||||
c.config.StateDir = t.TempDir()
|
||||
|
||||
defer func() {
|
||||
assert.NoError(t, unmountRecursive(context.Background(), c.config.StateDir))
|
||||
}()
|
||||
|
||||
c.config.EnableUnprivilegedICMP = true
|
||||
c.config.EnableUnprivilegedPorts = true
|
||||
config, imageConfig, specCheck := getRunPodSandboxTestData()
|
||||
|
||||
config, imageConfig, specCheck := getRunPodSandboxTestData(c.config)
|
||||
if test.configChange != nil {
|
||||
test.configChange(config)
|
||||
}
|
||||
@ -416,7 +449,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
|
||||
assert.NotNil(t, spec)
|
||||
specCheck(t, testID, spec)
|
||||
if test.specCheck != nil {
|
||||
test.specCheck(t, spec)
|
||||
test.specCheck(t, c, spec)
|
||||
}
|
||||
})
|
||||
}
|
||||
@ -757,6 +790,3 @@ options timeout:1
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(random-liu): [P1] Add unit test for different error cases to make sure
|
||||
// the function cleans up on error properly.
|
||||
|
@ -21,12 +21,13 @@ package podsandbox
|
||||
import (
|
||||
"testing"
|
||||
|
||||
criconfig "github.com/containerd/containerd/v2/internal/cri/config"
|
||||
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
)
|
||||
|
||||
func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
|
||||
func getRunPodSandboxTestData(_ criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
|
||||
config := &runtime.PodSandboxConfig{}
|
||||
imageConfig := &imagespec.ImageConfig{}
|
||||
specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) {
|
||||
|
@ -27,8 +27,14 @@ import (
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
|
||||
sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
|
||||
"github.com/containerd/containerd/v2/pkg/testutil"
|
||||
)
|
||||
|
||||
func TestEmpty(t *testing.T) {
|
||||
// NOTE: It's used to register -test.root for all platforms.
|
||||
testutil.RequiresRoot(t)
|
||||
}
|
||||
|
||||
func TestSandboxContainerSpec(t *testing.T) {
|
||||
switch goruntime.GOOS {
|
||||
case "darwin":
|
||||
@ -97,7 +103,7 @@ func TestSandboxContainerSpec(t *testing.T) {
|
||||
test := test
|
||||
t.Run(test.desc, func(t *testing.T) {
|
||||
c := newControllerService()
|
||||
config, imageConfig, specCheck := getRunPodSandboxTestData()
|
||||
config, imageConfig, specCheck := getRunPodSandboxTestData(c.config)
|
||||
if test.configChange != nil {
|
||||
test.configChange(config)
|
||||
}
|
||||
@ -154,7 +160,9 @@ func TestTypeurlMarshalUnmarshalSandboxMeta(t *testing.T) {
|
||||
Name: "sandbox_1",
|
||||
NetNSPath: "/home/cloud",
|
||||
}
|
||||
meta.Config, _, _ = getRunPodSandboxTestData()
|
||||
|
||||
c := newControllerService()
|
||||
meta.Config, _, _ = getRunPodSandboxTestData(c.config)
|
||||
if test.configChange != nil {
|
||||
test.configChange(meta.Config)
|
||||
}
|
||||
|
@ -25,10 +25,11 @@ import (
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
|
||||
"github.com/containerd/containerd/v2/internal/cri/annotations"
|
||||
criconfig "github.com/containerd/containerd/v2/internal/cri/config"
|
||||
"github.com/containerd/containerd/v2/internal/cri/opts"
|
||||
)
|
||||
|
||||
func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
|
||||
func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
|
||||
config := &runtime.PodSandboxConfig{
|
||||
Metadata: &runtime.PodSandboxMetadata{
|
||||
Name: "test-name",
|
||||
@ -100,7 +101,7 @@ func TestSandboxWindowsNetworkNamespace(t *testing.T) {
|
||||
nsPath := "test-cni"
|
||||
c := newControllerService()
|
||||
|
||||
config, imageConfig, specCheck := getRunPodSandboxTestData()
|
||||
config, imageConfig, specCheck := getRunPodSandboxTestData(c.config)
|
||||
spec, err := c.sandboxContainerSpec(testID, config, imageConfig, nsPath, nil)
|
||||
assert.NoError(t, err)
|
||||
assert.NotNil(t, spec)
|
||||
|
@ -167,18 +167,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
}
|
||||
|
||||
// Setup the network namespace if host networking wasn't requested.
|
||||
if !hostNetwork(config) && !userNsEnabled {
|
||||
// XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too.
|
||||
// We can't move this to a function, as the defer calls need to be executed if other
|
||||
// errors are returned in this function. So, we would need more refactors to move
|
||||
// this code to a function and the idea was to not change the current code for
|
||||
// !userNsEnabled case, therefore doing it would defeat the purpose.
|
||||
//
|
||||
// The difference between the cases is the use of netns.NewNetNS() vs
|
||||
// netns.NewNetNSFromPID().
|
||||
//
|
||||
// To simplify this, in the future, we should just remove this case (podNetwork &&
|
||||
// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled).
|
||||
if !hostNetwork(config) {
|
||||
span.AddEvent("setup pod network")
|
||||
netStart := time.Now()
|
||||
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||
@ -189,7 +178,13 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
if c.config.NetNSMountsUnderStateDir {
|
||||
netnsMountDir = filepath.Join(c.config.StateDir, "netns")
|
||||
}
|
||||
sandbox.NetNS, err = netns.NewNetNS(netnsMountDir)
|
||||
|
||||
if !userNsEnabled {
|
||||
sandbox.NetNS, err = netns.NewNetNS(netnsMountDir)
|
||||
} else {
|
||||
usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
|
||||
sandbox.NetNS, err = c.setupNetnsWithinUserns(netnsMountDir, usernsOpts)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
|
||||
}
|
||||
@ -284,92 +279,6 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
if !hostNetwork(config) && userNsEnabled {
|
||||
// If userns is enabled, then the netns was created by the OCI runtime
|
||||
// on controller.Start(). The OCI runtime needs to create the netns
|
||||
// because, if userns is in use, the netns needs to be owned by the
|
||||
// userns. So, let the OCI runtime just handle this for us.
|
||||
// If the netns is not owned by the userns several problems will happen.
|
||||
// For instance, the container will lack permission (even if
|
||||
// capabilities are present) to modify the netns or, even worse, the OCI
|
||||
// runtime will fail to mount sysfs:
|
||||
// https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164
|
||||
//
|
||||
// Note we do this after controller.Start(), as before that we
|
||||
// can't get the PID for the sandbox that we need for the netns.
|
||||
// Doing a controller.Status() call before that fails (can't
|
||||
// find the sandbox) so we can't get the PID.
|
||||
netStart := time.Now()
|
||||
|
||||
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
||||
// namespaces. If the pod is in host network namespace then both are empty and should not
|
||||
// be used.
|
||||
var netnsMountDir = "/var/run/netns"
|
||||
if c.config.NetNSMountsUnderStateDir {
|
||||
netnsMountDir = filepath.Join(c.config.StateDir, "netns")
|
||||
}
|
||||
|
||||
sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
// Update network namespace in the store, which is used to generate the container's spec
|
||||
sandbox.NetNSPath = sandbox.NetNS.GetPath()
|
||||
defer func() {
|
||||
// Remove the network namespace only if all the resource cleanup is done
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
|
||||
return
|
||||
}
|
||||
sandbox.NetNSPath = ""
|
||||
}
|
||||
}()
|
||||
|
||||
if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
|
||||
return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
|
||||
}
|
||||
// Save sandbox metadata to store
|
||||
if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
|
||||
return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
|
||||
// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource
|
||||
// creation functions.
|
||||
defer func() {
|
||||
// Remove the network namespace only if all the resource cleanup is done.
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
deferCtx, deferCancel := util.DeferContext()
|
||||
defer deferCancel()
|
||||
// Teardown network if an error is returned.
|
||||
if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
|
||||
}
|
||||
|
||||
}
|
||||
}()
|
||||
|
||||
// Setup network for sandbox.
|
||||
// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
|
||||
// rely on the assumption that CRI shim will not be querying the network namespace to check the
|
||||
// network states such as IP.
|
||||
// In future runtime implementation should avoid relying on CRI shim implementation details.
|
||||
// In this case however caching the IP will add a subtle performance enhancement by avoiding
|
||||
// calls to network namespace of the pod to query the IP of the veth interface on every
|
||||
// SandboxStatus request.
|
||||
if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
|
||||
return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
|
||||
}
|
||||
sandboxCreateNetworkTimer.UpdateSince(netStart)
|
||||
|
||||
span.AddEvent("finished pod network setup",
|
||||
tracing.Attribute("pod.network.setup.duration", time.Since(netStart).String()),
|
||||
)
|
||||
}
|
||||
|
||||
// TODO: get rid of this. sandbox object should no longer have Container field.
|
||||
if ociRuntime.Sandboxer == string(criconfig.ModePodSandbox) {
|
||||
container, err := c.client.LoadContainer(ctx, id)
|
||||
|
@ -18,9 +18,14 @@ package server
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"syscall"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/netns"
|
||||
"github.com/containerd/containerd/v2/pkg/sys"
|
||||
|
||||
"github.com/containernetworking/plugins/pkg/ns"
|
||||
"github.com/vishvananda/netlink"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
)
|
||||
|
||||
func (c *criService) bringUpLoopback(netns string) error {
|
||||
@ -35,3 +40,44 @@ func (c *criService) bringUpLoopback(netns string) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *criService) setupNetnsWithinUserns(netnsMountDir string, opt *runtime.UserNamespace) (*netns.NetNS, error) {
|
||||
if opt.GetMode() != runtime.NamespaceMode_POD {
|
||||
return nil, fmt.Errorf("required pod-level user namespace setting")
|
||||
}
|
||||
|
||||
uidMaps := opt.GetUids()
|
||||
if len(uidMaps) != 1 {
|
||||
return nil, fmt.Errorf("required only one uid mapping, but got %d uid mapping(s)", len(uidMaps))
|
||||
}
|
||||
if uidMaps[0] == nil {
|
||||
return nil, fmt.Errorf("required only one uid mapping, but got empty uid mapping")
|
||||
}
|
||||
|
||||
gidMaps := opt.GetGids()
|
||||
if len(gidMaps) != 1 {
|
||||
return nil, fmt.Errorf("required only one gid mapping, but got %d gid mapping(s)", len(gidMaps))
|
||||
}
|
||||
if gidMaps[0] == nil {
|
||||
return nil, fmt.Errorf("required only one gid mapping, but got empty gid mapping")
|
||||
}
|
||||
|
||||
var netNs *netns.NetNS
|
||||
var err error
|
||||
uerr := sys.UnshareAfterEnterUserns(
|
||||
fmt.Sprintf("%d:%d:%d", uidMaps[0].ContainerId, uidMaps[0].HostId, uidMaps[0].Length),
|
||||
fmt.Sprintf("%d:%d:%d", gidMaps[0].ContainerId, gidMaps[0].HostId, gidMaps[0].Length),
|
||||
syscall.CLONE_NEWNET,
|
||||
func(pid int) error {
|
||||
netNs, err = netns.NewNetNSFromPID(netnsMountDir, uint32(pid))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to mount netns from pid %d: %w", pid, err)
|
||||
}
|
||||
return nil
|
||||
},
|
||||
)
|
||||
if uerr != nil {
|
||||
return nil, uerr
|
||||
}
|
||||
return netNs, nil
|
||||
}
|
||||
|
@ -18,6 +18,17 @@
|
||||
|
||||
package server
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/netns"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
)
|
||||
|
||||
func (c *criService) bringUpLoopback(string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *criService) setupNetnsWithinUserns(basedir string, cfg *runtime.UserNamespace) (*netns.NetNS, error) {
|
||||
return nil, fmt.Errorf("unsupported to setup netns within userns on unix platform")
|
||||
}
|
||||
|
@ -16,6 +16,17 @@
|
||||
|
||||
package server
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/netns"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
)
|
||||
|
||||
func (c *criService) bringUpLoopback(string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *criService) setupNetnsWithinUserns(basedir string, cfg *runtime.UserNamespace) (*netns.NetNS, error) {
|
||||
return nil, fmt.Errorf("unsupported to setup netns within userns on windows platform")
|
||||
}
|
||||
|
38
pkg/sys/namespace_linux.go
Normal file
38
pkg/sys/namespace_linux.go
Normal file
@ -0,0 +1,38 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// GetUsernsForNamespace returns a file descriptor that refers to the owning
|
||||
// user namespace for the namespace referred to by fd.
|
||||
//
|
||||
// REF: https://man7.org/linux/man-pages/man2/ioctl_ns.2.html
|
||||
func GetUsernsForNamespace(fd uintptr) (*os.File, error) {
|
||||
fd, _, errno := unix.Syscall(syscall.SYS_IOCTL, fd, uintptr(unix.NS_GET_USERNS), 0)
|
||||
if errno != 0 {
|
||||
return nil, fmt.Errorf("failed to get user namespace fd: %w", errno)
|
||||
}
|
||||
|
||||
return os.NewFile(fd, fmt.Sprintf("/proc/%d/fd/%d", os.Getpid(), fd)), nil
|
||||
}
|
106
pkg/sys/namespace_linux_test.go
Normal file
106
pkg/sys/namespace_linux_test.go
Normal file
@ -0,0 +1,106 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
|
||||
"github.com/containerd/continuity/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
func TestGetUsernsForNamespace(t *testing.T) {
|
||||
testutil.RequiresRoot(t)
|
||||
|
||||
t.Parallel()
|
||||
|
||||
k409 := kernel.KernelVersion{Kernel: 4, Major: 9}
|
||||
ok, err := kernel.GreaterEqualThan(k409)
|
||||
require.NoError(t, err)
|
||||
if !ok {
|
||||
t.Skip("Requires kernel >= 4.9")
|
||||
}
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
|
||||
f, err := os.CreateTemp(tmpDir, "netns")
|
||||
require.NoError(t, err)
|
||||
|
||||
netnsPath := f.Name()
|
||||
f.Close()
|
||||
|
||||
defer testutil.Unmount(t, netnsPath)
|
||||
|
||||
currentUsernsIno, err := getNamespaceInode(os.Getpid(), "user")
|
||||
require.NoError(t, err)
|
||||
|
||||
usernsIno := uint64(0)
|
||||
uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWNET, func(pid int) error {
|
||||
err := unix.Mount(
|
||||
fmt.Sprintf("/proc/%d/ns/net", pid),
|
||||
netnsPath,
|
||||
"",
|
||||
unix.MS_BIND|unix.MS_RDONLY,
|
||||
"",
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
usernsIno, err = getNamespaceInode(pid, "user")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
})
|
||||
require.NoError(t, uerr)
|
||||
|
||||
require.NotEqual(t, currentUsernsIno, usernsIno)
|
||||
t.Logf("Current user namespace [%d], new user namespace [%d]", currentUsernsIno, usernsIno)
|
||||
|
||||
netnsFd, err := os.Open(netnsPath)
|
||||
require.NoError(t, err)
|
||||
defer netnsFd.Close()
|
||||
|
||||
usernsFd, err := GetUsernsForNamespace(netnsFd.Fd())
|
||||
require.NoError(t, err)
|
||||
defer usernsFd.Close()
|
||||
|
||||
usernsInoFromNetnsFd := getInode(t, usernsFd)
|
||||
|
||||
t.Logf("Fetch netns namespace %s' user namespace owner %d", netnsPath, usernsInoFromNetnsFd)
|
||||
require.Equal(t, usernsIno, usernsInoFromNetnsFd)
|
||||
|
||||
parentUsernsFd, err := GetUsernsForNamespace(usernsFd.Fd())
|
||||
require.NoError(t, err)
|
||||
defer parentUsernsFd.Close()
|
||||
|
||||
parentUsernsIno := getInode(t, parentUsernsFd)
|
||||
t.Logf("User namespace %d's parent %d", usernsInoFromNetnsFd, parentUsernsIno)
|
||||
require.Equal(t, currentUsernsIno, parentUsernsIno)
|
||||
}
|
||||
|
||||
func getInode(t *testing.T, f *os.File) uint64 {
|
||||
info, err := f.Stat()
|
||||
require.NoError(t, err)
|
||||
return info.Sys().(*syscall.Stat_t).Ino
|
||||
}
|
153
pkg/sys/unshare_linux.go
Normal file
153
pkg/sys/unshare_linux.go
Normal file
@ -0,0 +1,153 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// UnshareAfterEnterUserns allows to disassociate parts of its execution context
|
||||
// within a user namespace.
|
||||
func UnshareAfterEnterUserns(uidMap, gidMap string, unshareFlags uintptr, f func(pid int) error) (retErr error) {
|
||||
if unshareFlags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER {
|
||||
return fmt.Errorf("unshare flags should not include user namespace")
|
||||
}
|
||||
|
||||
uidMaps, err := parseIDMapping(uidMap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
gidMaps, err := parseIDMapping(gidMap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pidfd int
|
||||
proc, err := os.StartProcess("/proc/self/exe", []string{"UnshareAfterEnterUserns"}, &os.ProcAttr{
|
||||
Sys: &syscall.SysProcAttr{
|
||||
// clone new user namespace first and then unshare
|
||||
Cloneflags: unix.CLONE_NEWUSER,
|
||||
Unshareflags: unshareFlags,
|
||||
UidMappings: uidMaps,
|
||||
GidMappings: gidMaps,
|
||||
// NOTE: It's reexec but it's not heavy because subprocess
|
||||
// be in PTRACE_TRACEME mode before performing execve.
|
||||
Ptrace: true,
|
||||
Pdeathsig: syscall.SIGKILL,
|
||||
PidFD: &pidfd,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start noop process for unshare: %w", err)
|
||||
}
|
||||
|
||||
if pidfd == -1 || !SupportsPidFD() {
|
||||
proc.Kill()
|
||||
proc.Wait()
|
||||
return fmt.Errorf("kernel doesn't support CLONE_PIDFD")
|
||||
}
|
||||
|
||||
// Since go1.23.{0,1} has double close issue, we should dup it before using it.
|
||||
//
|
||||
// References:
|
||||
// - https://github.com/golang/go/issues/68984
|
||||
// - https://github.com/golang/go/milestone/371
|
||||
if goVer := runtime.Version(); goVer == "go1.23.0" || goVer == "go1.23.1" {
|
||||
dupPidfd, dupErr := unix.FcntlInt(uintptr(pidfd), syscall.F_DUPFD_CLOEXEC, 0)
|
||||
if dupErr != nil {
|
||||
proc.Kill()
|
||||
proc.Wait()
|
||||
return fmt.Errorf("failed to dupfd: %w", err)
|
||||
}
|
||||
pidfd = dupPidfd
|
||||
}
|
||||
|
||||
defer func() {
|
||||
derr := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0)
|
||||
if derr != nil {
|
||||
if !errors.Is(derr, unix.ESRCH) {
|
||||
retErr = derr
|
||||
}
|
||||
return
|
||||
}
|
||||
pidfdWaitid(pidfd)
|
||||
}()
|
||||
|
||||
if f != nil {
|
||||
if err := f(proc.Pid); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||
// should return error because the pid could be reused. It's safe to
|
||||
// return error and retry.
|
||||
if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil {
|
||||
return fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: Support multiple mappings in future
|
||||
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
|
||||
parts := strings.Split(mapping, ":")
|
||||
if len(parts) != 3 {
|
||||
return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
|
||||
}
|
||||
|
||||
cID, err := strconv.Atoi(parts[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
|
||||
}
|
||||
|
||||
hID, err := strconv.Atoi(parts[1])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
|
||||
}
|
||||
|
||||
size, err := strconv.Atoi(parts[2])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
|
||||
}
|
||||
|
||||
if cID < 0 || hID < 0 || size < 0 {
|
||||
return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
|
||||
}
|
||||
|
||||
return []syscall.SysProcIDMap{
|
||||
{
|
||||
ContainerID: cID,
|
||||
HostID: hID,
|
||||
Size: size,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func pidfdWaitid(pidfd int) error {
|
||||
return IgnoringEINTR(func() error {
|
||||
return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
|
||||
})
|
||||
}
|
149
pkg/sys/unshare_linux_test.go
Normal file
149
pkg/sys/unshare_linux_test.go
Normal file
@ -0,0 +1,149 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
|
||||
"github.com/containerd/continuity/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestUnshareAfterEnterUserns(t *testing.T) {
|
||||
testutil.RequiresRoot(t)
|
||||
|
||||
k510 := kernel.KernelVersion{Kernel: 5, Major: 10}
|
||||
ok, err := kernel.GreaterEqualThan(k510)
|
||||
require.NoError(t, err)
|
||||
if !ok {
|
||||
t.Skip("Requires kernel >= 5.10")
|
||||
}
|
||||
|
||||
err = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWUSER|syscall.CLONE_NEWIPC, nil)
|
||||
require.Error(t, err)
|
||||
require.ErrorContains(t, err, "unshare flags should not include user namespace")
|
||||
|
||||
t.Run("should work", testUnshareAfterEnterUsernsShouldWork)
|
||||
t.Run("killpid", testUnshareAfterEnterUsernsKillPid)
|
||||
t.Run("invalid unshare flags", testUnshareAfterEnterUsernsInvalidFlags)
|
||||
}
|
||||
|
||||
func testUnshareAfterEnterUsernsShouldWork(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
currentNetNs, err := getNamespaceInode(os.Getpid(), "net")
|
||||
require.NoError(t, err)
|
||||
|
||||
currentUserNs, err := getNamespaceInode(os.Getpid(), "user")
|
||||
require.NoError(t, err)
|
||||
|
||||
currentIpcNs, err := getNamespaceInode(os.Getpid(), "ipc")
|
||||
require.NoError(t, err)
|
||||
|
||||
currentPidNs, err := getNamespaceInode(os.Getpid(), "pid")
|
||||
require.NoError(t, err)
|
||||
|
||||
uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||
netNs, err := getNamespaceInode(pid, "net")
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, currentNetNs, netNs)
|
||||
|
||||
userNs, err := getNamespaceInode(pid, "user")
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, currentUserNs, userNs)
|
||||
|
||||
ipcNs, err := getNamespaceInode(pid, "ipc")
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, currentIpcNs, ipcNs)
|
||||
|
||||
pidNs, err := getNamespaceInode(pid, "pid")
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, currentPidNs, pidNs)
|
||||
|
||||
data, err := os.ReadFile(fmt.Sprintf("/proc/%d/uid_map", pid))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, " 0 1000 10\n", string(data))
|
||||
|
||||
data, err = os.ReadFile(fmt.Sprintf("/proc/%d/gid_map", pid))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, " 0 1000 10\n", string(data))
|
||||
return nil
|
||||
})
|
||||
require.NoError(t, uerr)
|
||||
}
|
||||
|
||||
func testUnshareAfterEnterUsernsKillPid(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||
proc, err := os.FindProcess(pid)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to find process: %w", err)
|
||||
}
|
||||
|
||||
if err := proc.Kill(); err != nil {
|
||||
return fmt.Errorf("failed to kill process: %w", err)
|
||||
}
|
||||
|
||||
proc.Wait()
|
||||
|
||||
_, err = os.OpenFile(fmt.Sprintf("/proc/%d/ns/net", pid), os.O_RDONLY, 0600)
|
||||
require.Error(t, err)
|
||||
require.ErrorIs(t, err, os.ErrNotExist)
|
||||
return err
|
||||
})
|
||||
require.Error(t, uerr)
|
||||
require.ErrorIs(t, uerr, os.ErrNotExist)
|
||||
|
||||
uerr = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||
proc, err := os.FindProcess(pid)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to find process: %w", err)
|
||||
}
|
||||
|
||||
if err := proc.Kill(); err != nil {
|
||||
return fmt.Errorf("failed to kill process: %w", err)
|
||||
}
|
||||
|
||||
proc.Wait()
|
||||
|
||||
return nil
|
||||
})
|
||||
require.Error(t, uerr)
|
||||
require.ErrorContains(t, uerr, "failed to ensure child process is alive: no such process")
|
||||
}
|
||||
|
||||
func testUnshareAfterEnterUsernsInvalidFlags(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_IO, nil)
|
||||
require.Error(t, uerr)
|
||||
require.ErrorContains(t, uerr, "fork/exec /proc/self/exe: invalid argument")
|
||||
}
|
||||
|
||||
func getNamespaceInode(pid int, typ string) (uint64, error) {
|
||||
info, err := os.Stat(fmt.Sprintf("/proc/%d/ns/%s", pid, typ))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return info.Sys().(*syscall.Stat_t).Ino, nil
|
||||
}
|
Loading…
Reference in New Issue
Block a user