Merge pull request #10607 from fuweid/pin-userns
internal/cri: simplify netns setup with pinned userns
This commit is contained in:
		| @@ -301,6 +301,23 @@ func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // WithNamespacePath sets the path of the namespace of type t that is already present in the spec to nsPath; it returns an error if the spec has no Linux section or contains no namespace of that type. | ||||||
|  | func WithNamespacePath(t runtimespec.LinuxNamespaceType, nsPath string) oci.SpecOpts { | ||||||
|  | 	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { | ||||||
|  | 		if s.Linux == nil { | ||||||
|  | 			return fmt.Errorf("Linux spec is required") | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		for i, ns := range s.Linux.Namespaces { | ||||||
|  | 			if ns.Type == t { | ||||||
|  | 				s.Linux.Namespaces[i].Path = nsPath | ||||||
|  | 				return nil | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		return fmt.Errorf("no such namespace %s", t) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| // WithPodNamespaces sets the pod namespaces for the container | // WithPodNamespaces sets the pod namespaces for the container | ||||||
| func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts { | func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts { | ||||||
| 	namespaces := config.GetNamespaceOptions() | 	namespaces := config.GetNamespaceOptions() | ||||||
|   | |||||||
| @@ -40,6 +40,7 @@ import ( | |||||||
| 	"github.com/containerd/containerd/v2/core/snapshots" | 	"github.com/containerd/containerd/v2/core/snapshots" | ||||||
| 	"github.com/containerd/containerd/v2/internal/cri/seutil" | 	"github.com/containerd/containerd/v2/internal/cri/seutil" | ||||||
| 	"github.com/containerd/containerd/v2/pkg/seccomp" | 	"github.com/containerd/containerd/v2/pkg/seccomp" | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/sys" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| const ( | const ( | ||||||
| @@ -88,6 +89,50 @@ func (c *Controller) getSandboxDevShm(id string) string { | |||||||
| 	return filepath.Join(c.getVolatileSandboxRootDir(id), "shm") | 	return filepath.Join(c.getVolatileSandboxRootDir(id), "shm") | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // getSandboxPinnedNamespaces returns the path of the "pinned-namespaces" | ||||||
|  | // directory located under the volatile root directory of sandbox id. | ||||||
|  | func (c *Controller) getSandboxPinnedNamespaces(id string) string { | ||||||
|  | 	return filepath.Join(c.getVolatileSandboxRootDir(id), "pinned-namespaces") | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // getSandboxPinnedUserNamespace returns the path of the "user" file inside the pinned-namespaces directory of sandbox id. | ||||||
|  | func (c *Controller) getSandboxPinnedUserNamespace(id string) string { | ||||||
|  | 	return filepath.Join(c.getSandboxPinnedNamespaces(id), "user") | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // pinUserNamespace creates the sandbox's pinned user namespace file (and its parent directory) and bind-mounts onto it the user namespace that sys.GetUsernsForNamespace reports for the netns at netnsPath; it fails if the pinned file already exists. | ||||||
|  | func (c *Controller) pinUserNamespace(sandboxID string, netnsPath string) error { | ||||||
|  | 	nsPath := c.getSandboxPinnedUserNamespace(sandboxID) | ||||||
|  |  | ||||||
|  | 	baseDir := filepath.Dir(nsPath) | ||||||
|  | 	if err := os.MkdirAll(baseDir, 0755); err != nil { | ||||||
|  | 		return fmt.Errorf("failed to init pinned-namespaces directory %s: %w", baseDir, err) | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	emptyFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return fmt.Errorf("failed to create empty file %s: %w", nsPath, err) | ||||||
|  | 	} | ||||||
|  | 	emptyFd.Close() | ||||||
|  |  | ||||||
|  | 	netnsFd, err := os.Open(netnsPath) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return fmt.Errorf("failed to open netns(%s): %w", netnsPath, err) | ||||||
|  | 	} | ||||||
|  | 	defer netnsFd.Close() | ||||||
|  |  | ||||||
|  | 	usernsFd, err := sys.GetUsernsForNamespace(netnsFd.Fd()) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return fmt.Errorf("failed to get user namespace for netns(%s): %w", netnsPath, err) | ||||||
|  | 	} | ||||||
|  | 	defer usernsFd.Close() | ||||||
|  |  | ||||||
|  | 	if err = unix.Mount(usernsFd.Name(), nsPath, "none", unix.MS_BIND, ""); err != nil { | ||||||
|  | 		return fmt.Errorf("failed to bind mount ns src: %v at %s: %w", usernsFd.Name(), nsPath, err) | ||||||
|  | 	} | ||||||
|  | 	return nil | ||||||
|  | } | ||||||
|  |  | ||||||
| func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) { | func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) { | ||||||
| 	var labels []string | 	var labels []string | ||||||
|  |  | ||||||
|   | |||||||
| @@ -95,6 +95,39 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll | |||||||
|  |  | ||||||
| 	labels["oci_runtime_type"] = ociRuntime.Type | 	labels["oci_runtime_type"] = ociRuntime.Type | ||||||
|  |  | ||||||
|  | 	// Create sandbox container root directories. | ||||||
|  | 	sandboxRootDir := c.getSandboxRootDir(id) | ||||||
|  | 	if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil { | ||||||
|  | 		return cin, fmt.Errorf("failed to create sandbox root directory %q: %w", | ||||||
|  | 			sandboxRootDir, err) | ||||||
|  | 	} | ||||||
|  | 	defer func() { | ||||||
|  | 		if retErr != nil && cleanupErr == nil { | ||||||
|  | 			// Cleanup the sandbox root directory. | ||||||
|  | 			if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil { | ||||||
|  | 				log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q", | ||||||
|  | 					sandboxRootDir) | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 	}() | ||||||
|  |  | ||||||
|  | 	volatileSandboxRootDir := c.getVolatileSandboxRootDir(id) | ||||||
|  | 	if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil { | ||||||
|  | 		return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w", | ||||||
|  | 			volatileSandboxRootDir, err) | ||||||
|  | 	} | ||||||
|  | 	defer func() { | ||||||
|  | 		if retErr != nil && cleanupErr == nil { | ||||||
|  | 			deferCtx, deferCancel := ctrdutil.DeferContext() | ||||||
|  | 			defer deferCancel() | ||||||
|  | 			// Cleanup the volatile sandbox root directory. | ||||||
|  | 			if cleanupErr = ensureRemoveAll(deferCtx, volatileSandboxRootDir); cleanupErr != nil { | ||||||
|  | 				log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q", | ||||||
|  | 					volatileSandboxRootDir) | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 	}() | ||||||
|  |  | ||||||
| 	// Create sandbox container. | 	// Create sandbox container. | ||||||
| 	// NOTE: sandboxContainerSpec SHOULD NOT have side | 	// NOTE: sandboxContainerSpec SHOULD NOT have side | ||||||
| 	// effect, e.g. accessing/creating files, so that we can test | 	// effect, e.g. accessing/creating files, so that we can test | ||||||
| @@ -164,37 +197,6 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll | |||||||
| 		} | 		} | ||||||
| 	}() | 	}() | ||||||
|  |  | ||||||
| 	// Create sandbox container root directories. |  | ||||||
| 	sandboxRootDir := c.getSandboxRootDir(id) |  | ||||||
| 	if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil { |  | ||||||
| 		return cin, fmt.Errorf("failed to create sandbox root directory %q: %w", |  | ||||||
| 			sandboxRootDir, err) |  | ||||||
| 	} |  | ||||||
| 	defer func() { |  | ||||||
| 		if retErr != nil && cleanupErr == nil { |  | ||||||
| 			// Cleanup the sandbox root directory. |  | ||||||
| 			if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil { |  | ||||||
| 				log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q", |  | ||||||
| 					sandboxRootDir) |  | ||||||
| 			} |  | ||||||
| 		} |  | ||||||
| 	}() |  | ||||||
|  |  | ||||||
| 	volatileSandboxRootDir := c.getVolatileSandboxRootDir(id) |  | ||||||
| 	if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil { |  | ||||||
| 		return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w", |  | ||||||
| 			volatileSandboxRootDir, err) |  | ||||||
| 	} |  | ||||||
| 	defer func() { |  | ||||||
| 		if retErr != nil && cleanupErr == nil { |  | ||||||
| 			// Cleanup the volatile sandbox root directory. |  | ||||||
| 			if cleanupErr = c.os.RemoveAll(volatileSandboxRootDir); cleanupErr != nil { |  | ||||||
| 				log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q", |  | ||||||
| 					volatileSandboxRootDir) |  | ||||||
| 			} |  | ||||||
| 		} |  | ||||||
| 	}() |  | ||||||
|  |  | ||||||
| 	// Setup files required for the sandbox. | 	// Setup files required for the sandbox. | ||||||
| 	if err = c.setupSandboxFiles(id, config); err != nil { | 	if err = c.setupSandboxFiles(id, config); err != nil { | ||||||
| 		return cin, fmt.Errorf("failed to setup sandbox files: %w", err) | 		return cin, fmt.Errorf("failed to setup sandbox files: %w", err) | ||||||
|   | |||||||
| @@ -103,6 +103,11 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC | |||||||
| 		case runtime.NamespaceMode_POD: | 		case runtime.NamespaceMode_POD: | ||||||
| 			specOpts = append(specOpts, oci.WithUserNamespace(uids, gids)) | 			specOpts = append(specOpts, oci.WithUserNamespace(uids, gids)) | ||||||
| 			usernsEnabled = true | 			usernsEnabled = true | ||||||
|  |  | ||||||
|  | 			if err := c.pinUserNamespace(id, nsPath); err != nil { | ||||||
|  | 				return nil, fmt.Errorf("failed to pin user namespace: %w", err) | ||||||
|  | 			} | ||||||
|  | 			specOpts = append(specOpts, customopts.WithNamespacePath(runtimespec.UserNamespace, c.getSandboxPinnedUserNamespace(id))) | ||||||
| 		default: | 		default: | ||||||
| 			return nil, fmt.Errorf("unsupported user namespace mode: %q", mode) | 			return nil, fmt.Errorf("unsupported user namespace mode: %q", mode) | ||||||
| 		} | 		} | ||||||
|   | |||||||
| @@ -17,9 +17,11 @@ | |||||||
| package podsandbox | package podsandbox | ||||||
|  |  | ||||||
| import ( | import ( | ||||||
|  | 	"context" | ||||||
| 	"os" | 	"os" | ||||||
| 	"path/filepath" | 	"path/filepath" | ||||||
| 	"strconv" | 	"strconv" | ||||||
|  | 	"syscall" | ||||||
| 	"testing" | 	"testing" | ||||||
|  |  | ||||||
| 	"github.com/moby/sys/userns" | 	"github.com/moby/sys/userns" | ||||||
| @@ -32,11 +34,15 @@ import ( | |||||||
| 	v1 "k8s.io/cri-api/pkg/apis/runtime/v1" | 	v1 "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||||
|  |  | ||||||
| 	"github.com/containerd/containerd/v2/internal/cri/annotations" | 	"github.com/containerd/containerd/v2/internal/cri/annotations" | ||||||
|  | 	criconfig "github.com/containerd/containerd/v2/internal/cri/config" | ||||||
| 	"github.com/containerd/containerd/v2/internal/cri/opts" | 	"github.com/containerd/containerd/v2/internal/cri/opts" | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/netns" | ||||||
| 	ostesting "github.com/containerd/containerd/v2/pkg/os/testing" | 	ostesting "github.com/containerd/containerd/v2/pkg/os/testing" | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/sys" | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/testutil" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||||
| 	config := &runtime.PodSandboxConfig{ | 	config := &runtime.PodSandboxConfig{ | ||||||
| 		Metadata: &runtime.PodSandboxMetadata{ | 		Metadata: &runtime.PodSandboxMetadata{ | ||||||
| 			Name:      "test-name", | 			Name:      "test-name", | ||||||
| @@ -94,7 +100,7 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf | |||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		assert.Contains(t, spec.Mounts, runtimespec.Mount{ | 		assert.Contains(t, spec.Mounts, runtimespec.Mount{ | ||||||
| 			Source:      "/test/root/sandboxes/test-id/resolv.conf", | 			Source:      filepath.Join(criCfg.RootDir, "sandboxes/test-id/resolv.conf"), | ||||||
| 			Destination: resolvConfPath, | 			Destination: resolvConfPath, | ||||||
| 			Type:        "bind", | 			Type:        "bind", | ||||||
| 			Options:     []string{"rbind", "ro", "nosuid", "nodev", "noexec"}, | 			Options:     []string{"rbind", "ro", "nosuid", "nodev", "noexec"}, | ||||||
| @@ -105,8 +111,10 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf | |||||||
| } | } | ||||||
|  |  | ||||||
| func TestLinuxSandboxContainerSpec(t *testing.T) { | func TestLinuxSandboxContainerSpec(t *testing.T) { | ||||||
|  | 	testutil.RequiresRoot(t) | ||||||
|  |  | ||||||
| 	testID := "test-id" | 	testID := "test-id" | ||||||
| 	nsPath := "test-cni" |  | ||||||
| 	idMap := runtime.IDMapping{ | 	idMap := runtime.IDMapping{ | ||||||
| 		HostId:      1000, | 		HostId:      1000, | ||||||
| 		ContainerId: 1000, | 		ContainerId: 1000, | ||||||
| @@ -118,15 +126,30 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 		Size:        10, | 		Size:        10, | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | 	netnsBasedir := t.TempDir() | ||||||
|  | 	t.Cleanup(func() { | ||||||
|  | 		assert.NoError(t, unmountRecursive(context.Background(), netnsBasedir)) | ||||||
|  | 	}) | ||||||
|  |  | ||||||
|  | 	var netNs *netns.NetNS | ||||||
|  | 	uerr := sys.UnshareAfterEnterUserns("1000:1000:10", "1000:1000:10", syscall.CLONE_NEWNET, func(pid int) error { | ||||||
|  | 		var err error | ||||||
|  | 		netNs, err = netns.NewNetNSFromPID(netnsBasedir, uint32(pid)) | ||||||
|  | 		return err | ||||||
|  | 	}) | ||||||
|  | 	require.NoError(t, uerr) | ||||||
|  |  | ||||||
|  | 	nsPath := netNs.GetPath() | ||||||
|  |  | ||||||
| 	for _, test := range []struct { | 	for _, test := range []struct { | ||||||
| 		desc         string | 		desc         string | ||||||
| 		configChange func(*runtime.PodSandboxConfig) | 		configChange func(*runtime.PodSandboxConfig) | ||||||
| 		specCheck    func(*testing.T, *runtimespec.Spec) | 		specCheck    func(*testing.T, *Controller, *runtimespec.Spec) | ||||||
| 		expectErr    bool | 		expectErr    bool | ||||||
| 	}{ | 	}{ | ||||||
| 		{ | 		{ | ||||||
| 			desc: "spec should reflect original config", | 			desc: "spec should reflect original config", | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||||
| 				// runtime spec should have expected namespaces enabled by default. | 				// runtime spec should have expected namespaces enabled by default. | ||||||
| 				require.NotNil(t, spec.Linux) | 				require.NotNil(t, spec.Linux) | ||||||
| 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | ||||||
| @@ -162,10 +185,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 					}, | 					}, | ||||||
| 				} | 				} | ||||||
| 			}, | 			}, | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) { | ||||||
| 				require.NotNil(t, spec.Linux) | 				require.NotNil(t, spec.Linux) | ||||||
| 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | ||||||
| 					Type: runtimespec.UserNamespace, | 					Type: runtimespec.UserNamespace, | ||||||
|  | 					Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"), | ||||||
| 				}) | 				}) | ||||||
| 				assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") | 				assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") | ||||||
| 			}, | 			}, | ||||||
| @@ -181,7 +205,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 					}, | 					}, | ||||||
| 				} | 				} | ||||||
| 			}, | 			}, | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||||
| 				// runtime spec should disable expected namespaces in host mode. | 				// runtime spec should disable expected namespaces in host mode. | ||||||
| 				require.NotNil(t, spec.Linux) | 				require.NotNil(t, spec.Linux) | ||||||
| 				assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | 				assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | ||||||
| @@ -213,10 +237,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 					}, | 					}, | ||||||
| 				} | 				} | ||||||
| 			}, | 			}, | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) { | ||||||
| 				require.NotNil(t, spec.Linux) | 				require.NotNil(t, spec.Linux) | ||||||
| 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | 				assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ | ||||||
| 					Type: runtimespec.UserNamespace, | 					Type: runtimespec.UserNamespace, | ||||||
|  | 					Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"), | ||||||
| 				}) | 				}) | ||||||
| 				require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) | 				require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) | ||||||
| 				require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) | 				require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) | ||||||
| @@ -314,7 +339,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 					SupplementalGroups: []int64{1111, 2222}, | 					SupplementalGroups: []int64{1111, 2222}, | ||||||
| 				} | 				} | ||||||
| 			}, | 			}, | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||||
| 				require.NotNil(t, spec.Process) | 				require.NotNil(t, spec.Process) | ||||||
| 				assert.Contains(t, spec.Process.User.AdditionalGids, uint32(1111)) | 				assert.Contains(t, spec.Process.User.AdditionalGids, uint32(1111)) | ||||||
| 				assert.Contains(t, spec.Process.User.AdditionalGids, uint32(2222)) | 				assert.Contains(t, spec.Process.User.AdditionalGids, uint32(2222)) | ||||||
| @@ -328,7 +353,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 					"net.ipv4.ping_group_range":           "1 1000", | 					"net.ipv4.ping_group_range":           "1 1000", | ||||||
| 				} | 				} | ||||||
| 			}, | 			}, | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||||
| 				require.NotNil(t, spec.Process) | 				require.NotNil(t, spec.Process) | ||||||
| 				assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "500") | 				assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "500") | ||||||
| 				assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "1 1000") | 				assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "1 1000") | ||||||
| @@ -344,7 +369,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 					MemoryLimitInBytes: 1024, | 					MemoryLimitInBytes: 1024, | ||||||
| 				} | 				} | ||||||
| 			}, | 			}, | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||||
| 				value, ok := spec.Annotations[annotations.SandboxCPUPeriod] | 				value, ok := spec.Annotations[annotations.SandboxCPUPeriod] | ||||||
| 				assert.True(t, ok) | 				assert.True(t, ok) | ||||||
| 				assert.EqualValues(t, strconv.FormatInt(100, 10), value) | 				assert.EqualValues(t, strconv.FormatInt(100, 10), value) | ||||||
| @@ -365,7 +390,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 		}, | 		}, | ||||||
| 		{ | 		{ | ||||||
| 			desc: "sandbox sizing annotations should not be set if LinuxContainerResources were not provided", | 			desc: "sandbox sizing annotations should not be set if LinuxContainerResources were not provided", | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||||
| 				_, ok := spec.Annotations[annotations.SandboxCPUPeriod] | 				_, ok := spec.Annotations[annotations.SandboxCPUPeriod] | ||||||
| 				assert.False(t, ok) | 				assert.False(t, ok) | ||||||
| 				_, ok = spec.Annotations[annotations.SandboxCPUQuota] | 				_, ok = spec.Annotations[annotations.SandboxCPUQuota] | ||||||
| @@ -381,7 +406,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 			configChange: func(c *runtime.PodSandboxConfig) { | 			configChange: func(c *runtime.PodSandboxConfig) { | ||||||
| 				c.Linux.Resources = &v1.LinuxContainerResources{} | 				c.Linux.Resources = &v1.LinuxContainerResources{} | ||||||
| 			}, | 			}, | ||||||
| 			specCheck: func(t *testing.T, spec *runtimespec.Spec) { | 			specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { | ||||||
| 				value, ok := spec.Annotations[annotations.SandboxCPUPeriod] | 				value, ok := spec.Annotations[annotations.SandboxCPUPeriod] | ||||||
| 				assert.True(t, ok) | 				assert.True(t, ok) | ||||||
| 				assert.EqualValues(t, "0", value) | 				assert.EqualValues(t, "0", value) | ||||||
| @@ -400,9 +425,17 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 		test := test | 		test := test | ||||||
| 		t.Run(test.desc, func(t *testing.T) { | 		t.Run(test.desc, func(t *testing.T) { | ||||||
| 			c := newControllerService() | 			c := newControllerService() | ||||||
|  | 			c.config.RootDir = t.TempDir() | ||||||
|  | 			c.config.StateDir = t.TempDir() | ||||||
|  |  | ||||||
|  | 			defer func() { | ||||||
|  | 				assert.NoError(t, unmountRecursive(context.Background(), c.config.StateDir)) | ||||||
|  | 			}() | ||||||
|  |  | ||||||
| 			c.config.EnableUnprivilegedICMP = true | 			c.config.EnableUnprivilegedICMP = true | ||||||
| 			c.config.EnableUnprivilegedPorts = true | 			c.config.EnableUnprivilegedPorts = true | ||||||
| 			config, imageConfig, specCheck := getRunPodSandboxTestData() |  | ||||||
|  | 			config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) | ||||||
| 			if test.configChange != nil { | 			if test.configChange != nil { | ||||||
| 				test.configChange(config) | 				test.configChange(config) | ||||||
| 			} | 			} | ||||||
| @@ -416,7 +449,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { | |||||||
| 			assert.NotNil(t, spec) | 			assert.NotNil(t, spec) | ||||||
| 			specCheck(t, testID, spec) | 			specCheck(t, testID, spec) | ||||||
| 			if test.specCheck != nil { | 			if test.specCheck != nil { | ||||||
| 				test.specCheck(t, spec) | 				test.specCheck(t, c, spec) | ||||||
| 			} | 			} | ||||||
| 		}) | 		}) | ||||||
| 	} | 	} | ||||||
| @@ -757,6 +790,3 @@ options timeout:1 | |||||||
| 		}) | 		}) | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
| // TODO(random-liu): [P1] Add unit test for different error cases to make sure |  | ||||||
| // the function cleans up on error properly. |  | ||||||
|   | |||||||
| @@ -21,12 +21,13 @@ package podsandbox | |||||||
| import ( | import ( | ||||||
| 	"testing" | 	"testing" | ||||||
|  |  | ||||||
|  | 	criconfig "github.com/containerd/containerd/v2/internal/cri/config" | ||||||
| 	imagespec "github.com/opencontainers/image-spec/specs-go/v1" | 	imagespec "github.com/opencontainers/image-spec/specs-go/v1" | ||||||
| 	runtimespec "github.com/opencontainers/runtime-spec/specs-go" | 	runtimespec "github.com/opencontainers/runtime-spec/specs-go" | ||||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | func getRunPodSandboxTestData(_ criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||||
| 	config := &runtime.PodSandboxConfig{} | 	config := &runtime.PodSandboxConfig{} | ||||||
| 	imageConfig := &imagespec.ImageConfig{} | 	imageConfig := &imagespec.ImageConfig{} | ||||||
| 	specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) { | 	specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) { | ||||||
|   | |||||||
| @@ -27,8 +27,14 @@ import ( | |||||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||||
|  |  | ||||||
| 	sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox" | 	sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox" | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/testutil" | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  | func TestEmpty(t *testing.T) { | ||||||
|  | 	// NOTE: This test is used to register the -test.root flag on all platforms. | ||||||
|  | 	testutil.RequiresRoot(t) | ||||||
|  | } | ||||||
|  |  | ||||||
| func TestSandboxContainerSpec(t *testing.T) { | func TestSandboxContainerSpec(t *testing.T) { | ||||||
| 	switch goruntime.GOOS { | 	switch goruntime.GOOS { | ||||||
| 	case "darwin": | 	case "darwin": | ||||||
| @@ -97,7 +103,7 @@ func TestSandboxContainerSpec(t *testing.T) { | |||||||
| 		test := test | 		test := test | ||||||
| 		t.Run(test.desc, func(t *testing.T) { | 		t.Run(test.desc, func(t *testing.T) { | ||||||
| 			c := newControllerService() | 			c := newControllerService() | ||||||
| 			config, imageConfig, specCheck := getRunPodSandboxTestData() | 			config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) | ||||||
| 			if test.configChange != nil { | 			if test.configChange != nil { | ||||||
| 				test.configChange(config) | 				test.configChange(config) | ||||||
| 			} | 			} | ||||||
| @@ -154,7 +160,9 @@ func TestTypeurlMarshalUnmarshalSandboxMeta(t *testing.T) { | |||||||
| 				Name:      "sandbox_1", | 				Name:      "sandbox_1", | ||||||
| 				NetNSPath: "/home/cloud", | 				NetNSPath: "/home/cloud", | ||||||
| 			} | 			} | ||||||
| 			meta.Config, _, _ = getRunPodSandboxTestData() |  | ||||||
|  | 			c := newControllerService() | ||||||
|  | 			meta.Config, _, _ = getRunPodSandboxTestData(c.config) | ||||||
| 			if test.configChange != nil { | 			if test.configChange != nil { | ||||||
| 				test.configChange(meta.Config) | 				test.configChange(meta.Config) | ||||||
| 			} | 			} | ||||||
|   | |||||||
| @@ -25,10 +25,11 @@ import ( | |||||||
| 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||||
|  |  | ||||||
| 	"github.com/containerd/containerd/v2/internal/cri/annotations" | 	"github.com/containerd/containerd/v2/internal/cri/annotations" | ||||||
|  | 	criconfig "github.com/containerd/containerd/v2/internal/cri/config" | ||||||
| 	"github.com/containerd/containerd/v2/internal/cri/opts" | 	"github.com/containerd/containerd/v2/internal/cri/opts" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { | ||||||
| 	config := &runtime.PodSandboxConfig{ | 	config := &runtime.PodSandboxConfig{ | ||||||
| 		Metadata: &runtime.PodSandboxMetadata{ | 		Metadata: &runtime.PodSandboxMetadata{ | ||||||
| 			Name:      "test-name", | 			Name:      "test-name", | ||||||
| @@ -100,7 +101,7 @@ func TestSandboxWindowsNetworkNamespace(t *testing.T) { | |||||||
| 	nsPath := "test-cni" | 	nsPath := "test-cni" | ||||||
| 	c := newControllerService() | 	c := newControllerService() | ||||||
|  |  | ||||||
| 	config, imageConfig, specCheck := getRunPodSandboxTestData() | 	config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) | ||||||
| 	spec, err := c.sandboxContainerSpec(testID, config, imageConfig, nsPath, nil) | 	spec, err := c.sandboxContainerSpec(testID, config, imageConfig, nsPath, nil) | ||||||
| 	assert.NoError(t, err) | 	assert.NoError(t, err) | ||||||
| 	assert.NotNil(t, spec) | 	assert.NotNil(t, spec) | ||||||
|   | |||||||
| @@ -167,18 +167,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Setup the network namespace if host networking wasn't requested. | 	// Setup the network namespace if host networking wasn't requested. | ||||||
| 	if !hostNetwork(config) && !userNsEnabled { | 	if !hostNetwork(config) { | ||||||
| 		// XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too. |  | ||||||
| 		// We can't move this to a function, as the defer calls need to be executed if other |  | ||||||
| 		// errors are returned in this function. So, we would need more refactors to move |  | ||||||
| 		// this code to a function and the idea was to not change the current code for |  | ||||||
| 		// !userNsEnabled case, therefore doing it would defeat the purpose. |  | ||||||
| 		// |  | ||||||
| 		// The difference between the cases is the use of netns.NewNetNS() vs |  | ||||||
| 		// netns.NewNetNSFromPID(). |  | ||||||
| 		// |  | ||||||
| 		// To simplify this, in the future, we should just remove this case (podNetwork && |  | ||||||
| 		// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled). |  | ||||||
| 		span.AddEvent("setup pod network") | 		span.AddEvent("setup pod network") | ||||||
| 		netStart := time.Now() | 		netStart := time.Now() | ||||||
| 		// If it is not in host network namespace then create a namespace and set the sandbox | 		// If it is not in host network namespace then create a namespace and set the sandbox | ||||||
| @@ -189,7 +178,13 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox | |||||||
| 		if c.config.NetNSMountsUnderStateDir { | 		if c.config.NetNSMountsUnderStateDir { | ||||||
| 			netnsMountDir = filepath.Join(c.config.StateDir, "netns") | 			netnsMountDir = filepath.Join(c.config.StateDir, "netns") | ||||||
| 		} | 		} | ||||||
| 		sandbox.NetNS, err = netns.NewNetNS(netnsMountDir) |  | ||||||
|  | 		if !userNsEnabled { | ||||||
|  | 			sandbox.NetNS, err = netns.NewNetNS(netnsMountDir) | ||||||
|  | 		} else { | ||||||
|  | 			usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() | ||||||
|  | 			sandbox.NetNS, err = c.setupNetnsWithinUserns(netnsMountDir, usernsOpts) | ||||||
|  | 		} | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) | 			return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) | ||||||
| 		} | 		} | ||||||
| @@ -284,92 +279,6 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox | |||||||
| 		return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) | 		return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if !hostNetwork(config) && userNsEnabled { |  | ||||||
| 		// If userns is enabled, then the netns was created by the OCI runtime |  | ||||||
| 		// on controller.Start(). The OCI runtime needs to create the netns |  | ||||||
| 		// because, if userns is in use, the netns needs to be owned by the |  | ||||||
| 		// userns. So, let the OCI runtime just handle this for us. |  | ||||||
| 		// If the netns is not owned by the userns several problems will happen. |  | ||||||
| 		// For instance, the container will lack permission (even if |  | ||||||
| 		// capabilities are present) to modify the netns or, even worse, the OCI |  | ||||||
| 		// runtime will fail to mount sysfs: |  | ||||||
| 		//      https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164 |  | ||||||
| 		// |  | ||||||
| 		// Note we do this after controller.Start(), as before that we |  | ||||||
| 		// can't get the PID for the sandbox that we need for the netns. |  | ||||||
| 		// Doing a controller.Status() call before that fails (can't |  | ||||||
| 		// find the sandbox) so we can't get the PID. |  | ||||||
| 		netStart := time.Now() |  | ||||||
|  |  | ||||||
| 		// If it is not in host network namespace then create a namespace and set the sandbox |  | ||||||
| 		// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network |  | ||||||
| 		// namespaces. If the pod is in host network namespace then both are empty and should not |  | ||||||
| 		// be used. |  | ||||||
| 		var netnsMountDir = "/var/run/netns" |  | ||||||
| 		if c.config.NetNSMountsUnderStateDir { |  | ||||||
| 			netnsMountDir = filepath.Join(c.config.StateDir, "netns") |  | ||||||
| 		} |  | ||||||
|  |  | ||||||
| 		sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid) |  | ||||||
| 		if err != nil { |  | ||||||
| 			return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) |  | ||||||
| 		} |  | ||||||
|  |  | ||||||
| 		// Update network namespace in the store, which is used to generate the container's spec |  | ||||||
| 		sandbox.NetNSPath = sandbox.NetNS.GetPath() |  | ||||||
| 		defer func() { |  | ||||||
| 			// Remove the network namespace only if all the resource cleanup is done |  | ||||||
| 			if retErr != nil && cleanupErr == nil { |  | ||||||
| 				if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { |  | ||||||
| 					log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) |  | ||||||
| 					return |  | ||||||
| 				} |  | ||||||
| 				sandbox.NetNSPath = "" |  | ||||||
| 			} |  | ||||||
| 		}() |  | ||||||
|  |  | ||||||
| 		if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil { |  | ||||||
| 			return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err) |  | ||||||
| 		} |  | ||||||
| 		// Save sandbox metadata to store |  | ||||||
| 		if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil { |  | ||||||
| 			return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) |  | ||||||
| 		} |  | ||||||
|  |  | ||||||
| 		// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. |  | ||||||
| 		// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource |  | ||||||
| 		// creation functions. |  | ||||||
| 		defer func() { |  | ||||||
| 			// Remove the network namespace only if all the resource cleanup is done. |  | ||||||
| 			if retErr != nil && cleanupErr == nil { |  | ||||||
| 				deferCtx, deferCancel := util.DeferContext() |  | ||||||
| 				defer deferCancel() |  | ||||||
| 				// Teardown network if an error is returned. |  | ||||||
| 				if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { |  | ||||||
| 					log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) |  | ||||||
| 				} |  | ||||||
|  |  | ||||||
| 			} |  | ||||||
| 		}() |  | ||||||
|  |  | ||||||
| 		// Setup network for sandbox. |  | ||||||
| 		// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) |  | ||||||
| 		// rely on the assumption that CRI shim will not be querying the network namespace to check the |  | ||||||
| 		// network states such as IP. |  | ||||||
| 		// In future runtime implementation should avoid relying on CRI shim implementation details. |  | ||||||
| 		// In this case however caching the IP will add a subtle performance enhancement by avoiding |  | ||||||
| 		// calls to network namespace of the pod to query the IP of the veth interface on every |  | ||||||
| 		// SandboxStatus request. |  | ||||||
| 		if err := c.setupPodNetwork(ctx, &sandbox); err != nil { |  | ||||||
| 			return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) |  | ||||||
| 		} |  | ||||||
| 		sandboxCreateNetworkTimer.UpdateSince(netStart) |  | ||||||
|  |  | ||||||
| 		span.AddEvent("finished pod network setup", |  | ||||||
| 			tracing.Attribute("pod.network.setup.duration", time.Since(netStart).String()), |  | ||||||
| 		) |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	// TODO: get rid of this. sandbox object should no longer have Container field. | 	// TODO: get rid of this. sandbox object should no longer have Container field. | ||||||
| 	if ociRuntime.Sandboxer == string(criconfig.ModePodSandbox) { | 	if ociRuntime.Sandboxer == string(criconfig.ModePodSandbox) { | ||||||
| 		container, err := c.client.LoadContainer(ctx, id) | 		container, err := c.client.LoadContainer(ctx, id) | ||||||
|   | |||||||
| @@ -18,9 +18,14 @@ package server | |||||||
|  |  | ||||||
| import ( | import ( | ||||||
| 	"fmt" | 	"fmt" | ||||||
|  | 	"syscall" | ||||||
|  |  | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/netns" | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/sys" | ||||||
|  |  | ||||||
| 	"github.com/containernetworking/plugins/pkg/ns" | 	"github.com/containernetworking/plugins/pkg/ns" | ||||||
| 	"github.com/vishvananda/netlink" | 	"github.com/vishvananda/netlink" | ||||||
|  | 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| func (c *criService) bringUpLoopback(netns string) error { | func (c *criService) bringUpLoopback(netns string) error { | ||||||
| @@ -35,3 +40,44 @@ func (c *criService) bringUpLoopback(netns string) error { | |||||||
| 	} | 	} | ||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func (c *criService) setupNetnsWithinUserns(netnsMountDir string, opt *runtime.UserNamespace) (*netns.NetNS, error) { | ||||||
|  | 	if opt.GetMode() != runtime.NamespaceMode_POD { | ||||||
|  | 		return nil, fmt.Errorf("required pod-level user namespace setting") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	uidMaps := opt.GetUids() | ||||||
|  | 	if len(uidMaps) != 1 { | ||||||
|  | 		return nil, fmt.Errorf("required only one uid mapping, but got %d uid mapping(s)", len(uidMaps)) | ||||||
|  | 	} | ||||||
|  | 	if uidMaps[0] == nil { | ||||||
|  | 		return nil, fmt.Errorf("required only one uid mapping, but got empty uid mapping") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	gidMaps := opt.GetGids() | ||||||
|  | 	if len(gidMaps) != 1 { | ||||||
|  | 		return nil, fmt.Errorf("required only one gid mapping, but got %d gid mapping(s)", len(gidMaps)) | ||||||
|  | 	} | ||||||
|  | 	if gidMaps[0] == nil { | ||||||
|  | 		return nil, fmt.Errorf("required only one gid mapping, but got empty gid mapping") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	var netNs *netns.NetNS | ||||||
|  | 	var err error | ||||||
|  | 	uerr := sys.UnshareAfterEnterUserns( | ||||||
|  | 		fmt.Sprintf("%d:%d:%d", uidMaps[0].ContainerId, uidMaps[0].HostId, uidMaps[0].Length), | ||||||
|  | 		fmt.Sprintf("%d:%d:%d", gidMaps[0].ContainerId, gidMaps[0].HostId, gidMaps[0].Length), | ||||||
|  | 		syscall.CLONE_NEWNET, | ||||||
|  | 		func(pid int) error { | ||||||
|  | 			netNs, err = netns.NewNetNSFromPID(netnsMountDir, uint32(pid)) | ||||||
|  | 			if err != nil { | ||||||
|  | 				return fmt.Errorf("failed to mount netns from pid %d: %w", pid, err) | ||||||
|  | 			} | ||||||
|  | 			return nil | ||||||
|  | 		}, | ||||||
|  | 	) | ||||||
|  | 	if uerr != nil { | ||||||
|  | 		return nil, uerr | ||||||
|  | 	} | ||||||
|  | 	return netNs, nil | ||||||
|  | } | ||||||
|   | |||||||
| @@ -18,6 +18,17 @@ | |||||||
|  |  | ||||||
| package server | package server | ||||||
|  |  | ||||||
|  | import ( | ||||||
|  | 	"fmt" | ||||||
|  |  | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/netns" | ||||||
|  | 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||||
|  | ) | ||||||
|  |  | ||||||
| func (c *criService) bringUpLoopback(string) error { | func (c *criService) bringUpLoopback(string) error { | ||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func (c *criService) setupNetnsWithinUserns(basedir string, cfg *runtime.UserNamespace) (*netns.NetNS, error) { | ||||||
|  | 	return nil, fmt.Errorf("unsupported to setup netns within userns on unix platform") | ||||||
|  | } | ||||||
|   | |||||||
| @@ -16,6 +16,17 @@ | |||||||
|  |  | ||||||
| package server | package server | ||||||
|  |  | ||||||
|  | import ( | ||||||
|  | 	"fmt" | ||||||
|  |  | ||||||
|  | 	"github.com/containerd/containerd/v2/pkg/netns" | ||||||
|  | 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1" | ||||||
|  | ) | ||||||
|  |  | ||||||
| func (c *criService) bringUpLoopback(string) error { | func (c *criService) bringUpLoopback(string) error { | ||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func (c *criService) setupNetnsWithinUserns(basedir string, cfg *runtime.UserNamespace) (*netns.NetNS, error) { | ||||||
|  | 	return nil, fmt.Errorf("unsupported to setup netns within userns on windows platform") | ||||||
|  | } | ||||||
|   | |||||||
							
								
								
									
										38
									
								
								pkg/sys/namespace_linux.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								pkg/sys/namespace_linux.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | |||||||
|  | /* | ||||||
|  |    Copyright The containerd Authors. | ||||||
|  |  | ||||||
|  |    Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |    you may not use this file except in compliance with the License. | ||||||
|  |    You may obtain a copy of the License at | ||||||
|  |  | ||||||
|  |        http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  | ||||||
|  |    Unless required by applicable law or agreed to in writing, software | ||||||
|  |    distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |    See the License for the specific language governing permissions and | ||||||
|  |    limitations under the License. | ||||||
|  | */ | ||||||
|  |  | ||||||
|  | package sys | ||||||
|  |  | ||||||
|  | import ( | ||||||
|  | 	"fmt" | ||||||
|  | 	"os" | ||||||
|  | 	"syscall" | ||||||
|  |  | ||||||
|  | 	"golang.org/x/sys/unix" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | // GetUsernsForNamespace returns a file descriptor that refers to the owning | ||||||
|  | // user namespace for the namespace referred to by fd. | ||||||
|  | // | ||||||
|  | // REF: https://man7.org/linux/man-pages/man2/ioctl_ns.2.html | ||||||
|  | func GetUsernsForNamespace(fd uintptr) (*os.File, error) { | ||||||
|  | 	fd, _, errno := unix.Syscall(syscall.SYS_IOCTL, fd, uintptr(unix.NS_GET_USERNS), 0) | ||||||
|  | 	if errno != 0 { | ||||||
|  | 		return nil, fmt.Errorf("failed to get user namespace fd: %w", errno) | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	return os.NewFile(fd, fmt.Sprintf("/proc/%d/fd/%d", os.Getpid(), fd)), nil | ||||||
|  | } | ||||||
							
								
								
									
										106
									
								
								pkg/sys/namespace_linux_test.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										106
									
								
								pkg/sys/namespace_linux_test.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,106 @@ | |||||||
|  | /* | ||||||
|  |    Copyright The containerd Authors. | ||||||
|  |  | ||||||
|  |    Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |    you may not use this file except in compliance with the License. | ||||||
|  |    You may obtain a copy of the License at | ||||||
|  |  | ||||||
|  |        http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  | ||||||
|  |    Unless required by applicable law or agreed to in writing, software | ||||||
|  |    distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |    See the License for the specific language governing permissions and | ||||||
|  |    limitations under the License. | ||||||
|  | */ | ||||||
|  |  | ||||||
|  | package sys | ||||||
|  |  | ||||||
|  | import ( | ||||||
|  | 	"fmt" | ||||||
|  | 	"os" | ||||||
|  | 	"syscall" | ||||||
|  | 	"testing" | ||||||
|  |  | ||||||
|  | 	kernel "github.com/containerd/containerd/v2/pkg/kernelversion" | ||||||
|  | 	"github.com/containerd/continuity/testutil" | ||||||
|  | 	"github.com/stretchr/testify/require" | ||||||
|  | 	"golang.org/x/sys/unix" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | func TestGetUsernsForNamespace(t *testing.T) { | ||||||
|  | 	testutil.RequiresRoot(t) | ||||||
|  |  | ||||||
|  | 	t.Parallel() | ||||||
|  |  | ||||||
|  | 	k409 := kernel.KernelVersion{Kernel: 4, Major: 9} | ||||||
|  | 	ok, err := kernel.GreaterEqualThan(k409) | ||||||
|  | 	require.NoError(t, err) | ||||||
|  | 	if !ok { | ||||||
|  | 		t.Skip("Requires kernel >= 4.9") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	tmpDir := t.TempDir() | ||||||
|  |  | ||||||
|  | 	f, err := os.CreateTemp(tmpDir, "netns") | ||||||
|  | 	require.NoError(t, err) | ||||||
|  |  | ||||||
|  | 	netnsPath := f.Name() | ||||||
|  | 	f.Close() | ||||||
|  |  | ||||||
|  | 	defer testutil.Unmount(t, netnsPath) | ||||||
|  |  | ||||||
|  | 	currentUsernsIno, err := getNamespaceInode(os.Getpid(), "user") | ||||||
|  | 	require.NoError(t, err) | ||||||
|  |  | ||||||
|  | 	usernsIno := uint64(0) | ||||||
|  | 	uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWNET, func(pid int) error { | ||||||
|  | 		err := unix.Mount( | ||||||
|  | 			fmt.Sprintf("/proc/%d/ns/net", pid), | ||||||
|  | 			netnsPath, | ||||||
|  | 			"", | ||||||
|  | 			unix.MS_BIND|unix.MS_RDONLY, | ||||||
|  | 			"", | ||||||
|  | 		) | ||||||
|  | 		if err != nil { | ||||||
|  | 			return err | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		usernsIno, err = getNamespaceInode(pid, "user") | ||||||
|  | 		if err != nil { | ||||||
|  | 			return err | ||||||
|  | 		} | ||||||
|  | 		return nil | ||||||
|  | 	}) | ||||||
|  | 	require.NoError(t, uerr) | ||||||
|  |  | ||||||
|  | 	require.NotEqual(t, currentUsernsIno, usernsIno) | ||||||
|  | 	t.Logf("Current user namespace [%d], new user namespace [%d]", currentUsernsIno, usernsIno) | ||||||
|  |  | ||||||
|  | 	netnsFd, err := os.Open(netnsPath) | ||||||
|  | 	require.NoError(t, err) | ||||||
|  | 	defer netnsFd.Close() | ||||||
|  |  | ||||||
|  | 	usernsFd, err := GetUsernsForNamespace(netnsFd.Fd()) | ||||||
|  | 	require.NoError(t, err) | ||||||
|  | 	defer usernsFd.Close() | ||||||
|  |  | ||||||
|  | 	usernsInoFromNetnsFd := getInode(t, usernsFd) | ||||||
|  |  | ||||||
|  | 	t.Logf("Fetch netns namespace %s' user namespace owner %d", netnsPath, usernsInoFromNetnsFd) | ||||||
|  | 	require.Equal(t, usernsIno, usernsInoFromNetnsFd) | ||||||
|  |  | ||||||
|  | 	parentUsernsFd, err := GetUsernsForNamespace(usernsFd.Fd()) | ||||||
|  | 	require.NoError(t, err) | ||||||
|  | 	defer parentUsernsFd.Close() | ||||||
|  |  | ||||||
|  | 	parentUsernsIno := getInode(t, parentUsernsFd) | ||||||
|  | 	t.Logf("User namespace %d's parent %d", usernsInoFromNetnsFd, parentUsernsIno) | ||||||
|  | 	require.Equal(t, currentUsernsIno, parentUsernsIno) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func getInode(t *testing.T, f *os.File) uint64 { | ||||||
|  | 	info, err := f.Stat() | ||||||
|  | 	require.NoError(t, err) | ||||||
|  | 	return info.Sys().(*syscall.Stat_t).Ino | ||||||
|  | } | ||||||
							
								
								
									
										153
									
								
								pkg/sys/unshare_linux.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										153
									
								
								pkg/sys/unshare_linux.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,153 @@ | |||||||
|  | /* | ||||||
|  |    Copyright The containerd Authors. | ||||||
|  |  | ||||||
|  |    Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |    you may not use this file except in compliance with the License. | ||||||
|  |    You may obtain a copy of the License at | ||||||
|  |  | ||||||
|  |        http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  | ||||||
|  |    Unless required by applicable law or agreed to in writing, software | ||||||
|  |    distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |    See the License for the specific language governing permissions and | ||||||
|  |    limitations under the License. | ||||||
|  | */ | ||||||
|  |  | ||||||
|  | package sys | ||||||
|  |  | ||||||
|  | import ( | ||||||
|  | 	"errors" | ||||||
|  | 	"fmt" | ||||||
|  | 	"os" | ||||||
|  | 	"runtime" | ||||||
|  | 	"strconv" | ||||||
|  | 	"strings" | ||||||
|  | 	"syscall" | ||||||
|  |  | ||||||
|  | 	"golang.org/x/sys/unix" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | // UnshareAfterEnterUserns allows to disassociate parts of its execution context | ||||||
|  | // within a user namespace. | ||||||
|  | func UnshareAfterEnterUserns(uidMap, gidMap string, unshareFlags uintptr, f func(pid int) error) (retErr error) { | ||||||
|  | 	if unshareFlags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER { | ||||||
|  | 		return fmt.Errorf("unshare flags should not include user namespace") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	uidMaps, err := parseIDMapping(uidMap) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return err | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	gidMaps, err := parseIDMapping(gidMap) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return err | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	var pidfd int | ||||||
|  | 	proc, err := os.StartProcess("/proc/self/exe", []string{"UnshareAfterEnterUserns"}, &os.ProcAttr{ | ||||||
|  | 		Sys: &syscall.SysProcAttr{ | ||||||
|  | 			// clone new user namespace first and then unshare | ||||||
|  | 			Cloneflags:   unix.CLONE_NEWUSER, | ||||||
|  | 			Unshareflags: unshareFlags, | ||||||
|  | 			UidMappings:  uidMaps, | ||||||
|  | 			GidMappings:  gidMaps, | ||||||
|  | 			// NOTE: It's reexec but it's not heavy because subprocess | ||||||
|  | 			// be in PTRACE_TRACEME mode before performing execve. | ||||||
|  | 			Ptrace:    true, | ||||||
|  | 			Pdeathsig: syscall.SIGKILL, | ||||||
|  | 			PidFD:     &pidfd, | ||||||
|  | 		}, | ||||||
|  | 	}) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return fmt.Errorf("failed to start noop process for unshare: %w", err) | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	if pidfd == -1 || !SupportsPidFD() { | ||||||
|  | 		proc.Kill() | ||||||
|  | 		proc.Wait() | ||||||
|  | 		return fmt.Errorf("kernel doesn't support CLONE_PIDFD") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	// Since go1.23.{0,1} has double close issue, we should dup it before using it. | ||||||
|  | 	// | ||||||
|  | 	// References: | ||||||
|  | 	// - https://github.com/golang/go/issues/68984 | ||||||
|  | 	// - https://github.com/golang/go/milestone/371 | ||||||
|  | 	if goVer := runtime.Version(); goVer == "go1.23.0" || goVer == "go1.23.1" { | ||||||
|  | 		dupPidfd, dupErr := unix.FcntlInt(uintptr(pidfd), syscall.F_DUPFD_CLOEXEC, 0) | ||||||
|  | 		if dupErr != nil { | ||||||
|  | 			proc.Kill() | ||||||
|  | 			proc.Wait() | ||||||
|  | 			return fmt.Errorf("failed to dupfd: %w", err) | ||||||
|  | 		} | ||||||
|  | 		pidfd = dupPidfd | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	defer func() { | ||||||
|  | 		derr := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0) | ||||||
|  | 		if derr != nil { | ||||||
|  | 			if !errors.Is(derr, unix.ESRCH) { | ||||||
|  | 				retErr = derr | ||||||
|  | 			} | ||||||
|  | 			return | ||||||
|  | 		} | ||||||
|  | 		pidfdWaitid(pidfd) | ||||||
|  | 	}() | ||||||
|  |  | ||||||
|  | 	if f != nil { | ||||||
|  | 		if err := f(proc.Pid); err != nil { | ||||||
|  | 			return err | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	// Ensure the child process is still alive. If the err is ESRCH, we | ||||||
|  | 	// should return error because the pid could be reused. It's safe to | ||||||
|  | 	// return error and retry. | ||||||
|  | 	if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil { | ||||||
|  | 		return fmt.Errorf("failed to ensure child process is alive: %w", err) | ||||||
|  | 	} | ||||||
|  | 	return nil | ||||||
|  | } | ||||||
|  |  | ||||||
// parseIDMapping converts a single `container-id:host-id:size` string into a
// one-element slice of syscall.SysProcIDMap. It fails when the string does not
// have exactly three colon-separated fields, when a field is not an integer,
// or when any value is negative.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	fields := strings.Split(mapping, ":")
	if len(fields) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}

	// Field descriptions, in positional order, used in parse error messages.
	labels := [3]string{"container id", "host id", "size"}

	var values [3]int
	for i, field := range fields {
		v, err := strconv.Atoi(field)
		if err != nil {
			return nil, fmt.Errorf("invalid %s for user namespace remapping, %w", labels[i], err)
		}
		values[i] = v
	}

	// Range validation happens only after every field parsed successfully.
	for _, v := range values {
		if v < 0 {
			return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
		}
	}

	idMap := syscall.SysProcIDMap{
		ContainerID: values[0],
		HostID:      values[1],
		Size:        values[2],
	}
	return []syscall.SysProcIDMap{idMap}, nil
}
|  |  | ||||||
|  | func pidfdWaitid(pidfd int) error { | ||||||
|  | 	return IgnoringEINTR(func() error { | ||||||
|  | 		return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil) | ||||||
|  | 	}) | ||||||
|  | } | ||||||
							
								
								
									
										149
									
								
								pkg/sys/unshare_linux_test.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										149
									
								
								pkg/sys/unshare_linux_test.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,149 @@ | |||||||
|  | /* | ||||||
|  |    Copyright The containerd Authors. | ||||||
|  |  | ||||||
|  |    Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  |    you may not use this file except in compliance with the License. | ||||||
|  |    You may obtain a copy of the License at | ||||||
|  |  | ||||||
|  |        http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  |  | ||||||
|  |    Unless required by applicable law or agreed to in writing, software | ||||||
|  |    distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  |    See the License for the specific language governing permissions and | ||||||
|  |    limitations under the License. | ||||||
|  | */ | ||||||
|  |  | ||||||
|  | package sys | ||||||
|  |  | ||||||
|  | import ( | ||||||
|  | 	"fmt" | ||||||
|  | 	"os" | ||||||
|  | 	"syscall" | ||||||
|  | 	"testing" | ||||||
|  |  | ||||||
|  | 	kernel "github.com/containerd/containerd/v2/pkg/kernelversion" | ||||||
|  | 	"github.com/containerd/continuity/testutil" | ||||||
|  | 	"github.com/stretchr/testify/require" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | func TestUnshareAfterEnterUserns(t *testing.T) { | ||||||
|  | 	testutil.RequiresRoot(t) | ||||||
|  |  | ||||||
|  | 	k510 := kernel.KernelVersion{Kernel: 5, Major: 10} | ||||||
|  | 	ok, err := kernel.GreaterEqualThan(k510) | ||||||
|  | 	require.NoError(t, err) | ||||||
|  | 	if !ok { | ||||||
|  | 		t.Skip("Requires kernel >= 5.10") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	err = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWUSER|syscall.CLONE_NEWIPC, nil) | ||||||
|  | 	require.Error(t, err) | ||||||
|  | 	require.ErrorContains(t, err, "unshare flags should not include user namespace") | ||||||
|  |  | ||||||
|  | 	t.Run("should work", testUnshareAfterEnterUsernsShouldWork) | ||||||
|  | 	t.Run("killpid", testUnshareAfterEnterUsernsKillPid) | ||||||
|  | 	t.Run("invalid unshare flags", testUnshareAfterEnterUsernsInvalidFlags) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func testUnshareAfterEnterUsernsShouldWork(t *testing.T) { | ||||||
|  | 	t.Parallel() | ||||||
|  |  | ||||||
|  | 	currentNetNs, err := getNamespaceInode(os.Getpid(), "net") | ||||||
|  | 	require.NoError(t, err) | ||||||
|  |  | ||||||
|  | 	currentUserNs, err := getNamespaceInode(os.Getpid(), "user") | ||||||
|  | 	require.NoError(t, err) | ||||||
|  |  | ||||||
|  | 	currentIpcNs, err := getNamespaceInode(os.Getpid(), "ipc") | ||||||
|  | 	require.NoError(t, err) | ||||||
|  |  | ||||||
|  | 	currentPidNs, err := getNamespaceInode(os.Getpid(), "pid") | ||||||
|  | 	require.NoError(t, err) | ||||||
|  |  | ||||||
|  | 	uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error { | ||||||
|  | 		netNs, err := getNamespaceInode(pid, "net") | ||||||
|  | 		require.NoError(t, err) | ||||||
|  | 		require.NotEqual(t, currentNetNs, netNs) | ||||||
|  |  | ||||||
|  | 		userNs, err := getNamespaceInode(pid, "user") | ||||||
|  | 		require.NoError(t, err) | ||||||
|  | 		require.NotEqual(t, currentUserNs, userNs) | ||||||
|  |  | ||||||
|  | 		ipcNs, err := getNamespaceInode(pid, "ipc") | ||||||
|  | 		require.NoError(t, err) | ||||||
|  | 		require.NotEqual(t, currentIpcNs, ipcNs) | ||||||
|  |  | ||||||
|  | 		pidNs, err := getNamespaceInode(pid, "pid") | ||||||
|  | 		require.NoError(t, err) | ||||||
|  | 		require.Equal(t, currentPidNs, pidNs) | ||||||
|  |  | ||||||
|  | 		data, err := os.ReadFile(fmt.Sprintf("/proc/%d/uid_map", pid)) | ||||||
|  | 		require.NoError(t, err) | ||||||
|  | 		require.Equal(t, "         0       1000         10\n", string(data)) | ||||||
|  |  | ||||||
|  | 		data, err = os.ReadFile(fmt.Sprintf("/proc/%d/gid_map", pid)) | ||||||
|  | 		require.NoError(t, err) | ||||||
|  | 		require.Equal(t, "         0       1000         10\n", string(data)) | ||||||
|  | 		return nil | ||||||
|  | 	}) | ||||||
|  | 	require.NoError(t, uerr) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func testUnshareAfterEnterUsernsKillPid(t *testing.T) { | ||||||
|  | 	t.Parallel() | ||||||
|  |  | ||||||
|  | 	uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error { | ||||||
|  | 		proc, err := os.FindProcess(pid) | ||||||
|  | 		if err != nil { | ||||||
|  | 			return fmt.Errorf("failed to find process: %w", err) | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		if err := proc.Kill(); err != nil { | ||||||
|  | 			return fmt.Errorf("failed to kill process: %w", err) | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		proc.Wait() | ||||||
|  |  | ||||||
|  | 		_, err = os.OpenFile(fmt.Sprintf("/proc/%d/ns/net", pid), os.O_RDONLY, 0600) | ||||||
|  | 		require.Error(t, err) | ||||||
|  | 		require.ErrorIs(t, err, os.ErrNotExist) | ||||||
|  | 		return err | ||||||
|  | 	}) | ||||||
|  | 	require.Error(t, uerr) | ||||||
|  | 	require.ErrorIs(t, uerr, os.ErrNotExist) | ||||||
|  |  | ||||||
|  | 	uerr = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error { | ||||||
|  | 		proc, err := os.FindProcess(pid) | ||||||
|  | 		if err != nil { | ||||||
|  | 			return fmt.Errorf("failed to find process: %w", err) | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		if err := proc.Kill(); err != nil { | ||||||
|  | 			return fmt.Errorf("failed to kill process: %w", err) | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		proc.Wait() | ||||||
|  |  | ||||||
|  | 		return nil | ||||||
|  | 	}) | ||||||
|  | 	require.Error(t, uerr) | ||||||
|  | 	require.ErrorContains(t, uerr, "failed to ensure child process is alive: no such process") | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func testUnshareAfterEnterUsernsInvalidFlags(t *testing.T) { | ||||||
|  | 	t.Parallel() | ||||||
|  |  | ||||||
|  | 	uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_IO, nil) | ||||||
|  | 	require.Error(t, uerr) | ||||||
|  | 	require.ErrorContains(t, uerr, "fork/exec /proc/self/exe: invalid argument") | ||||||
|  | } | ||||||
|  |  | ||||||
// getNamespaceInode stats /proc/<pid>/ns/<typ> and returns the inode number
// from the result. It returns 0 and the stat error if the path cannot be
// stat'ed.
func getNamespaceInode(pid int, typ string) (uint64, error) {
	nsPath := fmt.Sprintf("/proc/%d/ns/%s", pid, typ)

	fi, err := os.Stat(nsPath)
	if err != nil {
		return 0, err
	}

	st := fi.Sys().(*syscall.Stat_t)
	return st.Ino, nil
}
		Reference in New Issue
	
	Block a user
	 Akihiro Suda
					Akihiro Suda