Merge pull request #10607 from fuweid/pin-userns
internal/cri: simplify netns setup with pinned userns
This commit is contained in:
		| @@ -301,6 +301,23 @@ func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // WithNamespacePath updates namespace with existing path. | ||||
| func WithNamespacePath(t runtimespec.LinuxNamespaceType, nsPath string) oci.SpecOpts { | ||||
| 	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { | ||||
| 		if s.Linux == nil { | ||||
| 			return fmt.Errorf("Linux spec is required") | ||||
| 		} | ||||
|  | ||||
| 		for i, ns := range s.Linux.Namespaces { | ||||
| 			if ns.Type == t { | ||||
| 				s.Linux.Namespaces[i].Path = nsPath | ||||
| 				return nil | ||||
| 			} | ||||
| 		} | ||||
| 		return fmt.Errorf("no such namespace %s", t) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // WithPodNamespaces sets the pod namespaces for the container | ||||
| func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts { | ||||
| 	namespaces := config.GetNamespaceOptions() | ||||
|   | ||||
| @@ -40,6 +40,7 @@ import ( | ||||
| 	"github.com/containerd/containerd/v2/core/snapshots" | ||||
| 	"github.com/containerd/containerd/v2/internal/cri/seutil" | ||||
| 	"github.com/containerd/containerd/v2/pkg/seccomp" | ||||
| 	"github.com/containerd/containerd/v2/pkg/sys" | ||||
| ) | ||||
|  | ||||
| const ( | ||||
| @@ -88,6 +89,50 @@ func (c *Controller) getSandboxDevShm(id string) string { | ||||
| 	return filepath.Join(c.getVolatileSandboxRootDir(id), "shm") | ||||
| } | ||||
|  | ||||
| // getSandboxPinnedNamespaces returns the pinned namespaces directory inside the | ||||
| // sandbox state directory. | ||||
| func (c *Controller) getSandboxPinnedNamespaces(id string) string { | ||||
| 	return filepath.Join(c.getVolatileSandboxRootDir(id), "pinned-namespaces") | ||||
| } | ||||
|  | ||||
| // getSandboxPinnedUserNamespace returns the pinned user namespace file. | ||||
| func (c *Controller) getSandboxPinnedUserNamespace(id string) string { | ||||
| 	return filepath.Join(c.getSandboxPinnedNamespaces(id), "user") | ||||
| } | ||||
|  | ||||
| // pinUserNamespace persists user namespace in namespace filesystem. | ||||
| func (c *Controller) pinUserNamespace(sandboxID string, netnsPath string) error { | ||||
| 	nsPath := c.getSandboxPinnedUserNamespace(sandboxID) | ||||
|  | ||||
| 	baseDir := filepath.Dir(nsPath) | ||||
| 	if err := os.MkdirAll(baseDir, 0755); err != nil { | ||||
| 		return fmt.Errorf("failed to init pinned-namespaces directory %s: %w", baseDir, err) | ||||
| 	} | ||||
|  | ||||
| 	emptyFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666) | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("failed to create empty file %s: %w", nsPath, err) | ||||
| 	} | ||||
| 	emptyFd.Close() | ||||
|  | ||||
| 	netnsFd, err := os.Open(netnsPath) | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("failed to open netns(%s): %w", netnsPath, err) | ||||
| 	} | ||||
| 	defer netnsFd.Close() | ||||
|  | ||||
| 	usernsFd, err := sys.GetUsernsForNamespace(netnsFd.Fd()) | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("failed to get user namespace for netns(%s): %w", netnsPath, err) | ||||
| 	} | ||||
| 	defer usernsFd.Close() | ||||
|  | ||||
| 	if err = unix.Mount(usernsFd.Name(), nsPath, "none", unix.MS_BIND, ""); err != nil { | ||||
| 		return fmt.Errorf("failed to bind mount ns src: %v at %s: %w", usernsFd.Name(), nsPath, err) | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) { | ||||
| 	var labels []string | ||||
|  | ||||
|   | ||||
| @@ -95,6 +95,39 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll | ||||
|  | ||||
| 	labels["oci_runtime_type"] = ociRuntime.Type | ||||
|  | ||||
| 	// Create sandbox container root directories. | ||||
| 	sandboxRootDir := c.getSandboxRootDir(id) | ||||
| 	if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil { | ||||
| 		return cin, fmt.Errorf("failed to create sandbox root directory %q: %w", | ||||
| 			sandboxRootDir, err) | ||||
| 	} | ||||
| 	defer func() { | ||||
| 		if retErr != nil && cleanupErr == nil { | ||||
| 			// Cleanup the sandbox root directory. | ||||
| 			if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil { | ||||
| 				log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q", | ||||
| 					sandboxRootDir) | ||||
| 			} | ||||
| 		} | ||||
| 	}() | ||||
|  | ||||
| 	volatileSandboxRootDir := c.getVolatileSandboxRootDir(id) | ||||
| 	if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil { | ||||
| 		return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w", | ||||
| 			volatileSandboxRootDir, err) | ||||
| 	} | ||||
| 	defer func() { | ||||
| 		if retErr != nil && cleanupErr == nil { | ||||
| 			deferCtx, deferCancel := ctrdutil.DeferContext() | ||||
| 			defer deferCancel() | ||||
| 			// Cleanup the volatile sandbox root directory. | ||||
| 			if cleanupErr = ensureRemoveAll(deferCtx, volatileSandboxRootDir); cleanupErr != nil { | ||||
| 				log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q", | ||||
| 					volatileSandboxRootDir) | ||||
| 			} | ||||
| 		} | ||||
| 	}() | ||||
|  | ||||
| 	// Create sandbox container. | ||||
| 	// NOTE: sandboxContainerSpec SHOULD NOT have side | ||||
| 	// effect, e.g. accessing/creating files, so that we can test | ||||
| @@ -164,37 +197,6 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll | ||||
| 		} | ||||
| 	}() | ||||
|  | ||||
| 	// Create sandbox container root directories. | ||||
| 	sandboxRootDir := c.getSandboxRootDir(id) | ||||
| 	if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil { | ||||
| 		return cin, fmt.Errorf("failed to create sandbox root directory %q: %w", | ||||
| 			sandboxRootDir, err) | ||||
| 	} | ||||
| 	defer func() { | ||||
| 		if retErr != nil && cleanupErr == nil { | ||||
| 			// Cleanup the sandbox root directory. | ||||
| 			if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil { | ||||
| 				log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q", | ||||
| 					sandboxRootDir) | ||||
| 			} | ||||
| 		} | ||||
| 	}() | ||||
|  | ||||
| 	volatileSandboxRootDir := c.getVolatileSandboxRootDir(id) | ||||
| 	if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil { | ||||
| 		return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w", | ||||
| 			volatileSandboxRootDir, err) | ||||
| 	} | ||||
| 	defer func() { | ||||
| 		if retErr != nil && cleanupErr == nil { | ||||
| 			// Cleanup the volatile sandbox root directory. | ||||
| 			if cleanupErr = c.os.RemoveAll(volatileSandboxRootDir); cleanupErr != nil { | ||||
| 				log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q", | ||||
| 					volatileSandboxRootDir) | ||||
| 			} | ||||
| 		} | ||||
| 	}() | ||||
|  | ||||
| 	// Setup files required for the sandbox. | ||||
| 	if err = c.setupSandboxFiles(id, config); err != nil { | ||||
| 		return cin, fmt.Errorf("failed to setup sandbox files: %w", err) | ||||
|   | ||||
| @@ -103,6 +103,11 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC | ||||
| 		case runtime.NamespaceMode_POD: | ||||
| 			specOpts = append(specOpts, oci.WithUserNamespace(uids, gids)) | ||||
| 			usernsEnabled = true | ||||
|  | ||||
| 			if err := c.pinUserNamespace(id, nsPath); err != nil { | ||||
| 				return nil, fmt.Errorf("failed to pin user namespace: %w", err) | ||||
| 			} | ||||
| 			specOpts = append(specOpts, customopts.WithNamespacePath(runtimespec.UserNamespace, c.getSandboxPinnedUserNamespace(id))) | ||||
| 		default: | ||||
| 			return nil, fmt.Errorf("unsupported user namespace mode: %q", mode) | ||||
| 		} | ||||
|   | ||||
| @@ -17,9 +17,11 @@ | ||||
| package podsandbox | ||||
|  | ||||
| import ( | ||||
| 	"context" | ||||
| 	"os" | ||||
| 	"path/filepath" | ||||
| 	"strconv" | ||||
| 	"syscall" | ||||
| 	"testing" | ||||
|  | ||||
| 	"github.com/moby/sys/userns" | ||||
| @@ -32,11 +34,15 @@ import ( | ||||
| 	v1 "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||
|  | ||||
| 	"github.com/containerd/containerd/v2/internal/cri/annotations" | ||||
| 	criconfig "github.com/containerd/containerd/v2/internal/cri/config" | ||||
| 	"github.com/containerd/containerd/v2/internal/cri/opts" | ||||
| 	"github.com/containerd/containerd/v2/pkg/netns" | ||||
| 	ostesting "github.com/containerd/containerd/v2/pkg/os/testing" | ||||
| 	"github.com/containerd/containerd/v2/pkg/sys" | ||||
| 	"github.com/containerd/containerd/v2/pkg/testutil" | ||||
| ) | ||||
|  | ||||
| func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||
| func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||
| 	config := &runtime.PodSandboxConfig{ | ||||
| 		Metadata: &runtime.PodSandboxMetadata{ | ||||
| 			Name:      "test-name", | ||||
| @@ -94,7 +100,7 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf | ||||
| 		} | ||||
|  | ||||
| 		assert.Contains(t, spec.Mounts, runtimespec.Mount{ | ||||
| 			Source:      "/test/root/sandboxes/test-id/resolv.conf", | ||||
| 			Source:      filepath.Join(criCfg.RootDir, "sandboxes/test-id/resolv.conf"), | ||||
| 			Destination: resolvConfPath, | ||||
| 			Type:        "bind", | ||||
| 			Options:     []string{"rbind", "ro", "nosuid", "nodev", "noexec"}, | ||||
| @@ -105,8 +111,10 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf | ||||
| } | ||||
|  | ||||
| func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 	testutil.RequiresRoot(t) | ||||
|  | ||||
| 	testID := "test-id" | ||||
| 	nsPath := "test-cni" | ||||
|  | ||||
| 	idMap := runtime.IDMapping{ | ||||
| 		HostId:      1000, | ||||
| 		ContainerId: 1000, | ||||
| @@ -118,15 +126,30 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 		Size:        10, | ||||
| 	} | ||||
|  | ||||
| 	netnsBasedir := t.TempDir() | ||||
| 	t.Cleanup(func() { | ||||
| 		assert.NoError(t, unmountRecursive(context.Background(), netnsBasedir)) | ||||
| 	}) | ||||
|  | ||||
| 	var netNs *netns.NetNS | ||||
| 	uerr := sys.UnshareAfterEnterUserns("1000:1000:10", "1000:1000:10", syscall.CLONE_NEWNET, func(pid int) error { | ||||
| 		var err error | ||||
| 		netNs, err = netns.NewNetNSFromPID(netnsBasedir, uint32(pid)) | ||||
| 		return err | ||||
| 	}) | ||||
| 	require.NoError(t, uerr) | ||||
|  | ||||
| 	nsPath := netNs.GetPath() | ||||
|  | ||||
| 	for _, test := range []struct { | ||||
| 		desc         string | ||||
| 		configChange func(*runtime.PodSandboxConfig) | ||||
| 		specCheck    func(*testing.T, *runtimespec.Spec) | ||||
| 		specCheck    func(*testing.T, *Controller, *runtimespec.Spec) | ||||
| 		expectErr    bool | ||||
| 	}{ | ||||
| 		{ | ||||
| 			desc: "spec should reflect original config", | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||
| 				// runtime spec should have expected namespaces enabled by default. | ||||
| 				require.NotNil(t, spec.Linux) | ||||
| 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | ||||
| @@ -162,10 +185,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 					}, | ||||
| 				} | ||||
| 			}, | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) { | ||||
| 				require.NotNil(t, spec.Linux) | ||||
| 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | ||||
| 					Type: runtimespec.UserNamespace, | ||||
| 					Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"), | ||||
| 				}) | ||||
| 				assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") | ||||
| 			}, | ||||
| @@ -181,7 +205,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 					}, | ||||
| 				} | ||||
| 			}, | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||
| 				// runtime spec should disable expected namespaces in host mode. | ||||
| 				require.NotNil(t, spec.Linux) | ||||
| 				assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | ||||
| @@ -213,10 +237,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 					}, | ||||
| 				} | ||||
| 			}, | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) { | ||||
| 				require.NotNil(t, spec.Linux) | ||||
| 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | ||||
| 					Type: runtimespec.UserNamespace, | ||||
| 					Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"), | ||||
| 				}) | ||||
| 				require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) | ||||
| 				require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) | ||||
| @@ -314,7 +339,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 					SupplementalGroups: []int64{1111, 2222}, | ||||
| 				} | ||||
| 			}, | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||
| 				require.NotNil(t, spec.Process) | ||||
| 				assert.Contains(t, spec.Process.User.AdditionalGids, uint32(1111)) | ||||
| 				assert.Contains(t, spec.Process.User.AdditionalGids, uint32(2222)) | ||||
| @@ -328,7 +353,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 					"net.ipv4.ping_group_range":           "1 1000", | ||||
| 				} | ||||
| 			}, | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||
| 				require.NotNil(t, spec.Process) | ||||
| 				assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "500") | ||||
| 				assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "1 1000") | ||||
| @@ -344,7 +369,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 					MemoryLimitInBytes: 1024, | ||||
| 				} | ||||
| 			}, | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||
| 				value, ok := spec.Annotations[annotations.SandboxCPUPeriod] | ||||
| 				assert.True(t, ok) | ||||
| 				assert.EqualValues(t, strconv.FormatInt(100, 10), value) | ||||
| @@ -365,7 +390,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 		}, | ||||
| 		{ | ||||
| 			desc: "sandbox sizing annotations should not be set if LinuxContainerResources were not provided", | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||
| 				_, ok := spec.Annotations[annotations.SandboxCPUPeriod] | ||||
| 				assert.False(t, ok) | ||||
| 				_, ok = spec.Annotations[annotations.SandboxCPUQuota] | ||||
| @@ -381,7 +406,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 			configChange: func(c *runtime.PodSandboxConfig) { | ||||
| 				c.Linux.Resources = &v1.LinuxContainerResources{} | ||||
| 			}, | ||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | ||||
| 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||
| 				value, ok := spec.Annotations[annotations.SandboxCPUPeriod] | ||||
| 				assert.True(t, ok) | ||||
| 				assert.EqualValues(t, "0", value) | ||||
| @@ -400,9 +425,17 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 		test := test | ||||
| 		t.Run(test.desc, func(t *testing.T) { | ||||
| 			c := newControllerService() | ||||
| 			c.config.RootDir = t.TempDir() | ||||
| 			c.config.StateDir = t.TempDir() | ||||
|  | ||||
| 			defer func() { | ||||
| 				assert.NoError(t, unmountRecursive(context.Background(), c.config.StateDir)) | ||||
| 			}() | ||||
|  | ||||
| 			c.config.EnableUnprivilegedICMP = true | ||||
| 			c.config.EnableUnprivilegedPorts = true | ||||
| 			config, imageConfig, specCheck := getRunPodSandboxTestData() | ||||
|  | ||||
| 			config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) | ||||
| 			if test.configChange != nil { | ||||
| 				test.configChange(config) | ||||
| 			} | ||||
| @@ -416,7 +449,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||
| 			assert.NotNil(t, spec) | ||||
| 			specCheck(t, testID, spec) | ||||
| 			if test.specCheck != nil { | ||||
| 				test.specCheck(t, spec) | ||||
| 				test.specCheck(t, c, spec) | ||||
| 			} | ||||
| 		}) | ||||
| 	} | ||||
| @@ -757,6 +790,3 @@ options timeout:1 | ||||
| 		}) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // TODO(random-liu): [P1] Add unit test for different error cases to make sure | ||||
| // the function cleans up on error properly. | ||||
|   | ||||
| @@ -21,12 +21,13 @@ package podsandbox | ||||
| import ( | ||||
| 	"testing" | ||||
|  | ||||
| 	criconfig "github.com/containerd/containerd/v2/internal/cri/config" | ||||
| 	imagespec "github.com/opencontainers/image-spec/specs-go/v1" | ||||
| 	runtimespec "github.com/opencontainers/runtime-spec/specs-go" | ||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||
| ) | ||||
|  | ||||
| func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||
| func getRunPodSandboxTestData(_ criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||
| 	config := &runtime.PodSandboxConfig{} | ||||
| 	imageConfig := &imagespec.ImageConfig{} | ||||
| 	specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) { | ||||
|   | ||||
| @@ -27,8 +27,14 @@ import ( | ||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||
|  | ||||
| 	sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox" | ||||
| 	"github.com/containerd/containerd/v2/pkg/testutil" | ||||
| ) | ||||
|  | ||||
| func TestEmpty(t *testing.T) { | ||||
| 	// NOTE: It's used to register -test.root for all platforms. | ||||
| 	testutil.RequiresRoot(t) | ||||
| } | ||||
|  | ||||
| func TestSandboxContainerSpec(t *testing.T) { | ||||
| 	switch goruntime.GOOS { | ||||
| 	case "darwin": | ||||
| @@ -97,7 +103,7 @@ func TestSandboxContainerSpec(t *testing.T) { | ||||
| 		test := test | ||||
| 		t.Run(test.desc, func(t *testing.T) { | ||||
| 			c := newControllerService() | ||||
| 			config, imageConfig, specCheck := getRunPodSandboxTestData() | ||||
| 			config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) | ||||
| 			if test.configChange != nil { | ||||
| 				test.configChange(config) | ||||
| 			} | ||||
| @@ -154,7 +160,9 @@ func TestTypeurlMarshalUnmarshalSandboxMeta(t *testing.T) { | ||||
| 				Name:      "sandbox_1", | ||||
| 				NetNSPath: "/home/cloud", | ||||
| 			} | ||||
| 			meta.Config, _, _ = getRunPodSandboxTestData() | ||||
|  | ||||
| 			c := newControllerService() | ||||
| 			meta.Config, _, _ = getRunPodSandboxTestData(c.config) | ||||
| 			if test.configChange != nil { | ||||
| 				test.configChange(meta.Config) | ||||
| 			} | ||||
|   | ||||
| @@ -25,10 +25,11 @@ import ( | ||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||
|  | ||||
| 	"github.com/containerd/containerd/v2/internal/cri/annotations" | ||||
| 	criconfig "github.com/containerd/containerd/v2/internal/cri/config" | ||||
| 	"github.com/containerd/containerd/v2/internal/cri/opts" | ||||
| ) | ||||
|  | ||||
| func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||
| func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||
| 	config := &runtime.PodSandboxConfig{ | ||||
| 		Metadata: &runtime.PodSandboxMetadata{ | ||||
| 			Name:      "test-name", | ||||
| @@ -100,7 +101,7 @@ func TestSandboxWindowsNetworkNamespace(t *testing.T) { | ||||
| 	nsPath := "test-cni" | ||||
| 	c := newControllerService() | ||||
|  | ||||
| 	config, imageConfig, specCheck := getRunPodSandboxTestData() | ||||
| 	config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) | ||||
| 	spec, err := c.sandboxContainerSpec(testID, config, imageConfig, nsPath, nil) | ||||
| 	assert.NoError(t, err) | ||||
| 	assert.NotNil(t, spec) | ||||
|   | ||||
| @@ -167,18 +167,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox | ||||
| 	} | ||||
|  | ||||
| 	// Setup the network namespace if host networking wasn't requested. | ||||
| 	if !hostNetwork(config) && !userNsEnabled { | ||||
| 		// XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too. | ||||
| 		// We can't move this to a function, as the defer calls need to be executed if other | ||||
| 		// errors are returned in this function. So, we would need more refactors to move | ||||
| 		// this code to a function and the idea was to not change the current code for | ||||
| 		// !userNsEnabled case, therefore doing it would defeat the purpose. | ||||
| 		// | ||||
| 		// The difference between the cases is the use of netns.NewNetNS() vs | ||||
| 		// netns.NewNetNSFromPID(). | ||||
| 		// | ||||
| 		// To simplify this, in the future, we should just remove this case (podNetwork && | ||||
| 		// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled). | ||||
| 	if !hostNetwork(config) { | ||||
| 		span.AddEvent("setup pod network") | ||||
| 		netStart := time.Now() | ||||
| 		// If it is not in host network namespace then create a namespace and set the sandbox | ||||
| @@ -189,7 +178,13 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox | ||||
| 		if c.config.NetNSMountsUnderStateDir { | ||||
| 			netnsMountDir = filepath.Join(c.config.StateDir, "netns") | ||||
| 		} | ||||
| 		sandbox.NetNS, err = netns.NewNetNS(netnsMountDir) | ||||
|  | ||||
| 		if !userNsEnabled { | ||||
| 			sandbox.NetNS, err = netns.NewNetNS(netnsMountDir) | ||||
| 		} else { | ||||
| 			usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() | ||||
| 			sandbox.NetNS, err = c.setupNetnsWithinUserns(netnsMountDir, usernsOpts) | ||||
| 		} | ||||
| 		if err != nil { | ||||
| 			return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) | ||||
| 		} | ||||
| @@ -284,92 +279,6 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox | ||||
| 		return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) | ||||
| 	} | ||||
|  | ||||
| 	if !hostNetwork(config) && userNsEnabled { | ||||
| 		// If userns is enabled, then the netns was created by the OCI runtime | ||||
| 		// on controller.Start(). The OCI runtime needs to create the netns | ||||
| 		// because, if userns is in use, the netns needs to be owned by the | ||||
| 		// userns. So, let the OCI runtime just handle this for us. | ||||
| 		// If the netns is not owned by the userns several problems will happen. | ||||
| 		// For instance, the container will lack permission (even if | ||||
| 		// capabilities are present) to modify the netns or, even worse, the OCI | ||||
| 		// runtime will fail to mount sysfs: | ||||
| 		//      https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164 | ||||
| 		// | ||||
| 		// Note we do this after controller.Start(), as before that we | ||||
| 		// can't get the PID for the sandbox that we need for the netns. | ||||
| 		// Doing a controller.Status() call before that fails (can't | ||||
| 		// find the sandbox) so we can't get the PID. | ||||
| 		netStart := time.Now() | ||||
|  | ||||
| 		// If it is not in host network namespace then create a namespace and set the sandbox | ||||
| 		// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network | ||||
| 		// namespaces. If the pod is in host network namespace then both are empty and should not | ||||
| 		// be used. | ||||
| 		var netnsMountDir = "/var/run/netns" | ||||
| 		if c.config.NetNSMountsUnderStateDir { | ||||
| 			netnsMountDir = filepath.Join(c.config.StateDir, "netns") | ||||
| 		} | ||||
|  | ||||
| 		sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid) | ||||
| 		if err != nil { | ||||
| 			return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) | ||||
| 		} | ||||
|  | ||||
| 		// Update network namespace in the store, which is used to generate the container's spec | ||||
| 		sandbox.NetNSPath = sandbox.NetNS.GetPath() | ||||
| 		defer func() { | ||||
| 			// Remove the network namespace only if all the resource cleanup is done | ||||
| 			if retErr != nil && cleanupErr == nil { | ||||
| 				if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { | ||||
| 					log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) | ||||
| 					return | ||||
| 				} | ||||
| 				sandbox.NetNSPath = "" | ||||
| 			} | ||||
| 		}() | ||||
|  | ||||
| 		if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil { | ||||
| 			return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err) | ||||
| 		} | ||||
| 		// Save sandbox metadata to store | ||||
| 		if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil { | ||||
| 			return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) | ||||
| 		} | ||||
|  | ||||
| 		// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. | ||||
| 		// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource | ||||
| 		// creation functions. | ||||
| 		defer func() { | ||||
| 			// Remove the network namespace only if all the resource cleanup is done. | ||||
| 			if retErr != nil && cleanupErr == nil { | ||||
| 				deferCtx, deferCancel := util.DeferContext() | ||||
| 				defer deferCancel() | ||||
| 				// Teardown network if an error is returned. | ||||
| 				if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { | ||||
| 					log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) | ||||
| 				} | ||||
|  | ||||
| 			} | ||||
| 		}() | ||||
|  | ||||
| 		// Setup network for sandbox. | ||||
| 		// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) | ||||
| 		// rely on the assumption that CRI shim will not be querying the network namespace to check the | ||||
| 		// network states such as IP. | ||||
| 		// In future runtime implementation should avoid relying on CRI shim implementation details. | ||||
| 		// In this case however caching the IP will add a subtle performance enhancement by avoiding | ||||
| 		// calls to network namespace of the pod to query the IP of the veth interface on every | ||||
| 		// SandboxStatus request. | ||||
| 		if err := c.setupPodNetwork(ctx, &sandbox); err != nil { | ||||
| 			return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) | ||||
| 		} | ||||
| 		sandboxCreateNetworkTimer.UpdateSince(netStart) | ||||
|  | ||||
| 		span.AddEvent("finished pod network setup", | ||||
| 			tracing.Attribute("pod.network.setup.duration", time.Since(netStart).String()), | ||||
| 		) | ||||
| 	} | ||||
|  | ||||
| 	// TODO: get rid of this. sandbox object should no longer have Container field. | ||||
| 	if ociRuntime.Sandboxer == string(criconfig.ModePodSandbox) { | ||||
| 		container, err := c.client.LoadContainer(ctx, id) | ||||
|   | ||||
| @@ -18,9 +18,14 @@ package server | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"syscall" | ||||
|  | ||||
| 	"github.com/containerd/containerd/v2/pkg/netns" | ||||
| 	"github.com/containerd/containerd/v2/pkg/sys" | ||||
|  | ||||
| 	"github.com/containernetworking/plugins/pkg/ns" | ||||
| 	"github.com/vishvananda/netlink" | ||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||
| ) | ||||
|  | ||||
| func (c *criService) bringUpLoopback(netns string) error { | ||||
| @@ -35,3 +40,44 @@ func (c *criService) bringUpLoopback(netns string) error { | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func (c *criService) setupNetnsWithinUserns(netnsMountDir string, opt *runtime.UserNamespace) (*netns.NetNS, error) { | ||||
| 	if opt.GetMode() != runtime.NamespaceMode_POD { | ||||
| 		return nil, fmt.Errorf("required pod-level user namespace setting") | ||||
| 	} | ||||
|  | ||||
| 	uidMaps := opt.GetUids() | ||||
| 	if len(uidMaps) != 1 { | ||||
| 		return nil, fmt.Errorf("required only one uid mapping, but got %d uid mapping(s)", len(uidMaps)) | ||||
| 	} | ||||
| 	if uidMaps[0] == nil { | ||||
| 		return nil, fmt.Errorf("required only one uid mapping, but got empty uid mapping") | ||||
| 	} | ||||
|  | ||||
| 	gidMaps := opt.GetGids() | ||||
| 	if len(gidMaps) != 1 { | ||||
| 		return nil, fmt.Errorf("required only one gid mapping, but got %d gid mapping(s)", len(gidMaps)) | ||||
| 	} | ||||
| 	if gidMaps[0] == nil { | ||||
| 		return nil, fmt.Errorf("required only one gid mapping, but got empty gid mapping") | ||||
| 	} | ||||
|  | ||||
| 	var netNs *netns.NetNS | ||||
| 	var err error | ||||
| 	uerr := sys.UnshareAfterEnterUserns( | ||||
| 		fmt.Sprintf("%d:%d:%d", uidMaps[0].ContainerId, uidMaps[0].HostId, uidMaps[0].Length), | ||||
| 		fmt.Sprintf("%d:%d:%d", gidMaps[0].ContainerId, gidMaps[0].HostId, gidMaps[0].Length), | ||||
| 		syscall.CLONE_NEWNET, | ||||
| 		func(pid int) error { | ||||
| 			netNs, err = netns.NewNetNSFromPID(netnsMountDir, uint32(pid)) | ||||
| 			if err != nil { | ||||
| 				return fmt.Errorf("failed to mount netns from pid %d: %w", pid, err) | ||||
| 			} | ||||
| 			return nil | ||||
| 		}, | ||||
| 	) | ||||
| 	if uerr != nil { | ||||
| 		return nil, uerr | ||||
| 	} | ||||
| 	return netNs, nil | ||||
| } | ||||
|   | ||||
| @@ -18,6 +18,17 @@ | ||||
|  | ||||
| package server | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
|  | ||||
| 	"github.com/containerd/containerd/v2/pkg/netns" | ||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||
| ) | ||||
|  | ||||
| func (c *criService) bringUpLoopback(string) error { | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func (c *criService) setupNetnsWithinUserns(basedir string, cfg *runtime.UserNamespace) (*netns.NetNS, error) { | ||||
| 	return nil, fmt.Errorf("unsupported to setup netns within userns on unix platform") | ||||
| } | ||||
|   | ||||
| @@ -16,6 +16,17 @@ | ||||
|  | ||||
| package server | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
|  | ||||
| 	"github.com/containerd/containerd/v2/pkg/netns" | ||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||
| ) | ||||
|  | ||||
| func (c *criService) bringUpLoopback(string) error { | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func (c *criService) setupNetnsWithinUserns(basedir string, cfg *runtime.UserNamespace) (*netns.NetNS, error) { | ||||
| 	return nil, fmt.Errorf("unsupported to setup netns within userns on windows platform") | ||||
| } | ||||
|   | ||||
							
								
								
									
										38
									
								
								pkg/sys/namespace_linux.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								pkg/sys/namespace_linux.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| /* | ||||
|    Copyright The containerd Authors. | ||||
|  | ||||
|    Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|    you may not use this file except in compliance with the License. | ||||
|    You may obtain a copy of the License at | ||||
|  | ||||
|        http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
|    Unless required by applicable law or agreed to in writing, software | ||||
|    distributed under the License is distributed on an "AS IS" BASIS, | ||||
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|    See the License for the specific language governing permissions and | ||||
|    limitations under the License. | ||||
| */ | ||||
|  | ||||
| package sys | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"os" | ||||
| 	"syscall" | ||||
|  | ||||
| 	"golang.org/x/sys/unix" | ||||
| ) | ||||
|  | ||||
| // GetUsernsForNamespace returns a file descriptor that refers to the owning | ||||
| // user namespace for the namespace referred to by fd. | ||||
| // | ||||
| // REF: https://man7.org/linux/man-pages/man2/ioctl_ns.2.html | ||||
| func GetUsernsForNamespace(fd uintptr) (*os.File, error) { | ||||
| 	fd, _, errno := unix.Syscall(syscall.SYS_IOCTL, fd, uintptr(unix.NS_GET_USERNS), 0) | ||||
| 	if errno != 0 { | ||||
| 		return nil, fmt.Errorf("failed to get user namespace fd: %w", errno) | ||||
| 	} | ||||
|  | ||||
| 	return os.NewFile(fd, fmt.Sprintf("/proc/%d/fd/%d", os.Getpid(), fd)), nil | ||||
| } | ||||
							
								
								
									
										106
									
								
								pkg/sys/namespace_linux_test.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										106
									
								
								pkg/sys/namespace_linux_test.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,106 @@ | ||||
| /* | ||||
|    Copyright The containerd Authors. | ||||
|  | ||||
|    Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|    you may not use this file except in compliance with the License. | ||||
|    You may obtain a copy of the License at | ||||
|  | ||||
|        http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
|    Unless required by applicable law or agreed to in writing, software | ||||
|    distributed under the License is distributed on an "AS IS" BASIS, | ||||
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|    See the License for the specific language governing permissions and | ||||
|    limitations under the License. | ||||
| */ | ||||
|  | ||||
| package sys | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"os" | ||||
| 	"syscall" | ||||
| 	"testing" | ||||
|  | ||||
| 	kernel "github.com/containerd/containerd/v2/pkg/kernelversion" | ||||
| 	"github.com/containerd/continuity/testutil" | ||||
| 	"github.com/stretchr/testify/require" | ||||
| 	"golang.org/x/sys/unix" | ||||
| ) | ||||
|  | ||||
| func TestGetUsernsForNamespace(t *testing.T) { | ||||
| 	testutil.RequiresRoot(t) | ||||
|  | ||||
| 	t.Parallel() | ||||
|  | ||||
| 	k409 := kernel.KernelVersion{Kernel: 4, Major: 9} | ||||
| 	ok, err := kernel.GreaterEqualThan(k409) | ||||
| 	require.NoError(t, err) | ||||
| 	if !ok { | ||||
| 		t.Skip("Requires kernel >= 4.9") | ||||
| 	} | ||||
|  | ||||
| 	tmpDir := t.TempDir() | ||||
|  | ||||
| 	f, err := os.CreateTemp(tmpDir, "netns") | ||||
| 	require.NoError(t, err) | ||||
|  | ||||
| 	netnsPath := f.Name() | ||||
| 	f.Close() | ||||
|  | ||||
| 	defer testutil.Unmount(t, netnsPath) | ||||
|  | ||||
| 	currentUsernsIno, err := getNamespaceInode(os.Getpid(), "user") | ||||
| 	require.NoError(t, err) | ||||
|  | ||||
| 	usernsIno := uint64(0) | ||||
| 	uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWNET, func(pid int) error { | ||||
| 		err := unix.Mount( | ||||
| 			fmt.Sprintf("/proc/%d/ns/net", pid), | ||||
| 			netnsPath, | ||||
| 			"", | ||||
| 			unix.MS_BIND|unix.MS_RDONLY, | ||||
| 			"", | ||||
| 		) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
|  | ||||
| 		usernsIno, err = getNamespaceInode(pid, "user") | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 		return nil | ||||
| 	}) | ||||
| 	require.NoError(t, uerr) | ||||
|  | ||||
| 	require.NotEqual(t, currentUsernsIno, usernsIno) | ||||
| 	t.Logf("Current user namespace [%d], new user namespace [%d]", currentUsernsIno, usernsIno) | ||||
|  | ||||
| 	netnsFd, err := os.Open(netnsPath) | ||||
| 	require.NoError(t, err) | ||||
| 	defer netnsFd.Close() | ||||
|  | ||||
| 	usernsFd, err := GetUsernsForNamespace(netnsFd.Fd()) | ||||
| 	require.NoError(t, err) | ||||
| 	defer usernsFd.Close() | ||||
|  | ||||
| 	usernsInoFromNetnsFd := getInode(t, usernsFd) | ||||
|  | ||||
| 	t.Logf("Fetch netns namespace %s' user namespace owner %d", netnsPath, usernsInoFromNetnsFd) | ||||
| 	require.Equal(t, usernsIno, usernsInoFromNetnsFd) | ||||
|  | ||||
| 	parentUsernsFd, err := GetUsernsForNamespace(usernsFd.Fd()) | ||||
| 	require.NoError(t, err) | ||||
| 	defer parentUsernsFd.Close() | ||||
|  | ||||
| 	parentUsernsIno := getInode(t, parentUsernsFd) | ||||
| 	t.Logf("User namespace %d's parent %d", usernsInoFromNetnsFd, parentUsernsIno) | ||||
| 	require.Equal(t, currentUsernsIno, parentUsernsIno) | ||||
| } | ||||
|  | ||||
| func getInode(t *testing.T, f *os.File) uint64 { | ||||
| 	info, err := f.Stat() | ||||
| 	require.NoError(t, err) | ||||
| 	return info.Sys().(*syscall.Stat_t).Ino | ||||
| } | ||||
							
								
								
									
										153
									
								
								pkg/sys/unshare_linux.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										153
									
								
								pkg/sys/unshare_linux.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,153 @@ | ||||
| /* | ||||
|    Copyright The containerd Authors. | ||||
|  | ||||
|    Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|    you may not use this file except in compliance with the License. | ||||
|    You may obtain a copy of the License at | ||||
|  | ||||
|        http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
|    Unless required by applicable law or agreed to in writing, software | ||||
|    distributed under the License is distributed on an "AS IS" BASIS, | ||||
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|    See the License for the specific language governing permissions and | ||||
|    limitations under the License. | ||||
| */ | ||||
|  | ||||
| package sys | ||||
|  | ||||
| import ( | ||||
| 	"errors" | ||||
| 	"fmt" | ||||
| 	"os" | ||||
| 	"runtime" | ||||
| 	"strconv" | ||||
| 	"strings" | ||||
| 	"syscall" | ||||
|  | ||||
| 	"golang.org/x/sys/unix" | ||||
| ) | ||||
|  | ||||
| // UnshareAfterEnterUserns allows to disassociate parts of its execution context | ||||
| // within a user namespace. | ||||
| func UnshareAfterEnterUserns(uidMap, gidMap string, unshareFlags uintptr, f func(pid int) error) (retErr error) { | ||||
| 	if unshareFlags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER { | ||||
| 		return fmt.Errorf("unshare flags should not include user namespace") | ||||
| 	} | ||||
|  | ||||
| 	uidMaps, err := parseIDMapping(uidMap) | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
|  | ||||
| 	gidMaps, err := parseIDMapping(gidMap) | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
|  | ||||
| 	var pidfd int | ||||
| 	proc, err := os.StartProcess("/proc/self/exe", []string{"UnshareAfterEnterUserns"}, &os.ProcAttr{ | ||||
| 		Sys: &syscall.SysProcAttr{ | ||||
| 			// clone new user namespace first and then unshare | ||||
| 			Cloneflags:   unix.CLONE_NEWUSER, | ||||
| 			Unshareflags: unshareFlags, | ||||
| 			UidMappings:  uidMaps, | ||||
| 			GidMappings:  gidMaps, | ||||
| 			// NOTE: It's reexec but it's not heavy because subprocess | ||||
| 			// be in PTRACE_TRACEME mode before performing execve. | ||||
| 			Ptrace:    true, | ||||
| 			Pdeathsig: syscall.SIGKILL, | ||||
| 			PidFD:     &pidfd, | ||||
| 		}, | ||||
| 	}) | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("failed to start noop process for unshare: %w", err) | ||||
| 	} | ||||
|  | ||||
| 	if pidfd == -1 || !SupportsPidFD() { | ||||
| 		proc.Kill() | ||||
| 		proc.Wait() | ||||
| 		return fmt.Errorf("kernel doesn't support CLONE_PIDFD") | ||||
| 	} | ||||
|  | ||||
| 	// Since go1.23.{0,1} has double close issue, we should dup it before using it. | ||||
| 	// | ||||
| 	// References: | ||||
| 	// - https://github.com/golang/go/issues/68984 | ||||
| 	// - https://github.com/golang/go/milestone/371 | ||||
| 	if goVer := runtime.Version(); goVer == "go1.23.0" || goVer == "go1.23.1" { | ||||
| 		dupPidfd, dupErr := unix.FcntlInt(uintptr(pidfd), syscall.F_DUPFD_CLOEXEC, 0) | ||||
| 		if dupErr != nil { | ||||
| 			proc.Kill() | ||||
| 			proc.Wait() | ||||
| 			return fmt.Errorf("failed to dupfd: %w", err) | ||||
| 		} | ||||
| 		pidfd = dupPidfd | ||||
| 	} | ||||
|  | ||||
| 	defer func() { | ||||
| 		derr := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0) | ||||
| 		if derr != nil { | ||||
| 			if !errors.Is(derr, unix.ESRCH) { | ||||
| 				retErr = derr | ||||
| 			} | ||||
| 			return | ||||
| 		} | ||||
| 		pidfdWaitid(pidfd) | ||||
| 	}() | ||||
|  | ||||
| 	if f != nil { | ||||
| 		if err := f(proc.Pid); err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	// Ensure the child process is still alive. If the err is ESRCH, we | ||||
| 	// should return error because the pid could be reused. It's safe to | ||||
| 	// return error and retry. | ||||
| 	if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil { | ||||
| 		return fmt.Errorf("failed to ensure child process is alive: %w", err) | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
// parseIDMapping converts one `container-id:host-id:size` string into the
// single-element mapping slice used for syscall.SysProcAttr UID/GID mappings.
//
// All three fields must be decimal integers. Negative values are rejected,
// and so is a size of zero: the kernel refuses zero-length extents, so it is
// reported here with a descriptive error instead of surfacing later as an
// opaque failure when the child process is created.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	parts := strings.Split(mapping, ":")
	if len(parts) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}

	cID, err := strconv.Atoi(parts[0])
	if err != nil {
		return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
	}

	hID, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
	}

	size, err := strconv.Atoi(parts[2])
	if err != nil {
		return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
	}

	if cID < 0 || hID < 0 || size < 0 {
		return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
	}

	// An ID of 0 is a valid mapping endpoint, but an extent must cover at
	// least one ID.
	if size == 0 {
		return nil, fmt.Errorf("invalid mapping %s, size must be greater than zero", mapping)
	}

	return []syscall.SysProcIDMap{
		{
			ContainerID: cID,
			HostID:      hID,
			Size:        size,
		},
	}, nil
}
|  | ||||
| func pidfdWaitid(pidfd int) error { | ||||
| 	return IgnoringEINTR(func() error { | ||||
| 		return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil) | ||||
| 	}) | ||||
| } | ||||
							
								
								
									
										149
									
								
								pkg/sys/unshare_linux_test.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										149
									
								
								pkg/sys/unshare_linux_test.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,149 @@ | ||||
| /* | ||||
|    Copyright The containerd Authors. | ||||
|  | ||||
|    Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|    you may not use this file except in compliance with the License. | ||||
|    You may obtain a copy of the License at | ||||
|  | ||||
|        http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
|    Unless required by applicable law or agreed to in writing, software | ||||
|    distributed under the License is distributed on an "AS IS" BASIS, | ||||
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|    See the License for the specific language governing permissions and | ||||
|    limitations under the License. | ||||
| */ | ||||
|  | ||||
| package sys | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"os" | ||||
| 	"syscall" | ||||
| 	"testing" | ||||
|  | ||||
| 	kernel "github.com/containerd/containerd/v2/pkg/kernelversion" | ||||
| 	"github.com/containerd/continuity/testutil" | ||||
| 	"github.com/stretchr/testify/require" | ||||
| ) | ||||
|  | ||||
| func TestUnshareAfterEnterUserns(t *testing.T) { | ||||
| 	testutil.RequiresRoot(t) | ||||
|  | ||||
| 	k510 := kernel.KernelVersion{Kernel: 5, Major: 10} | ||||
| 	ok, err := kernel.GreaterEqualThan(k510) | ||||
| 	require.NoError(t, err) | ||||
| 	if !ok { | ||||
| 		t.Skip("Requires kernel >= 5.10") | ||||
| 	} | ||||
|  | ||||
| 	err = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWUSER|syscall.CLONE_NEWIPC, nil) | ||||
| 	require.Error(t, err) | ||||
| 	require.ErrorContains(t, err, "unshare flags should not include user namespace") | ||||
|  | ||||
| 	t.Run("should work", testUnshareAfterEnterUsernsShouldWork) | ||||
| 	t.Run("killpid", testUnshareAfterEnterUsernsKillPid) | ||||
| 	t.Run("invalid unshare flags", testUnshareAfterEnterUsernsInvalidFlags) | ||||
| } | ||||
|  | ||||
| func testUnshareAfterEnterUsernsShouldWork(t *testing.T) { | ||||
| 	t.Parallel() | ||||
|  | ||||
| 	currentNetNs, err := getNamespaceInode(os.Getpid(), "net") | ||||
| 	require.NoError(t, err) | ||||
|  | ||||
| 	currentUserNs, err := getNamespaceInode(os.Getpid(), "user") | ||||
| 	require.NoError(t, err) | ||||
|  | ||||
| 	currentIpcNs, err := getNamespaceInode(os.Getpid(), "ipc") | ||||
| 	require.NoError(t, err) | ||||
|  | ||||
| 	currentPidNs, err := getNamespaceInode(os.Getpid(), "pid") | ||||
| 	require.NoError(t, err) | ||||
|  | ||||
| 	uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error { | ||||
| 		netNs, err := getNamespaceInode(pid, "net") | ||||
| 		require.NoError(t, err) | ||||
| 		require.NotEqual(t, currentNetNs, netNs) | ||||
|  | ||||
| 		userNs, err := getNamespaceInode(pid, "user") | ||||
| 		require.NoError(t, err) | ||||
| 		require.NotEqual(t, currentUserNs, userNs) | ||||
|  | ||||
| 		ipcNs, err := getNamespaceInode(pid, "ipc") | ||||
| 		require.NoError(t, err) | ||||
| 		require.NotEqual(t, currentIpcNs, ipcNs) | ||||
|  | ||||
| 		pidNs, err := getNamespaceInode(pid, "pid") | ||||
| 		require.NoError(t, err) | ||||
| 		require.Equal(t, currentPidNs, pidNs) | ||||
|  | ||||
| 		data, err := os.ReadFile(fmt.Sprintf("/proc/%d/uid_map", pid)) | ||||
| 		require.NoError(t, err) | ||||
| 		require.Equal(t, "         0       1000         10\n", string(data)) | ||||
|  | ||||
| 		data, err = os.ReadFile(fmt.Sprintf("/proc/%d/gid_map", pid)) | ||||
| 		require.NoError(t, err) | ||||
| 		require.Equal(t, "         0       1000         10\n", string(data)) | ||||
| 		return nil | ||||
| 	}) | ||||
| 	require.NoError(t, uerr) | ||||
| } | ||||
|  | ||||
| func testUnshareAfterEnterUsernsKillPid(t *testing.T) { | ||||
| 	t.Parallel() | ||||
|  | ||||
| 	uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error { | ||||
| 		proc, err := os.FindProcess(pid) | ||||
| 		if err != nil { | ||||
| 			return fmt.Errorf("failed to find process: %w", err) | ||||
| 		} | ||||
|  | ||||
| 		if err := proc.Kill(); err != nil { | ||||
| 			return fmt.Errorf("failed to kill process: %w", err) | ||||
| 		} | ||||
|  | ||||
| 		proc.Wait() | ||||
|  | ||||
| 		_, err = os.OpenFile(fmt.Sprintf("/proc/%d/ns/net", pid), os.O_RDONLY, 0600) | ||||
| 		require.Error(t, err) | ||||
| 		require.ErrorIs(t, err, os.ErrNotExist) | ||||
| 		return err | ||||
| 	}) | ||||
| 	require.Error(t, uerr) | ||||
| 	require.ErrorIs(t, uerr, os.ErrNotExist) | ||||
|  | ||||
| 	uerr = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error { | ||||
| 		proc, err := os.FindProcess(pid) | ||||
| 		if err != nil { | ||||
| 			return fmt.Errorf("failed to find process: %w", err) | ||||
| 		} | ||||
|  | ||||
| 		if err := proc.Kill(); err != nil { | ||||
| 			return fmt.Errorf("failed to kill process: %w", err) | ||||
| 		} | ||||
|  | ||||
| 		proc.Wait() | ||||
|  | ||||
| 		return nil | ||||
| 	}) | ||||
| 	require.Error(t, uerr) | ||||
| 	require.ErrorContains(t, uerr, "failed to ensure child process is alive: no such process") | ||||
| } | ||||
|  | ||||
| func testUnshareAfterEnterUsernsInvalidFlags(t *testing.T) { | ||||
| 	t.Parallel() | ||||
|  | ||||
| 	uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_IO, nil) | ||||
| 	require.Error(t, uerr) | ||||
| 	require.ErrorContains(t, uerr, "fork/exec /proc/self/exe: invalid argument") | ||||
| } | ||||
|  | ||||
// getNamespaceInode stats /proc/<pid>/ns/<typ> and returns the inode number
// of the result. If the stat fails, it returns 0 and the stat error.
func getNamespaceInode(pid int, typ string) (uint64, error) {
	nsPath := fmt.Sprintf("/proc/%d/ns/%s", pid, typ)

	fi, err := os.Stat(nsPath)
	if err != nil {
		return 0, err
	}

	st := fi.Sys().(*syscall.Stat_t)
	return st.Ino, nil
}
		Reference in New Issue
	
	Block a user
	 Akihiro Suda
					Akihiro Suda