diff --git a/pkg/sys/unshare_linux.go b/pkg/sys/unshare_linux.go
new file mode 100644
index 000000000..e8774a21c
--- /dev/null
+++ b/pkg/sys/unshare_linux.go
@@ -0,0 +1,173 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package sys
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"runtime"
+	"strconv"
+	"strings"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+)
+
+// UnshareAfterEnterUserns starts a helper child process in a new user
+// namespace configured with uidMap and gidMap, lets that child unshare the
+// namespaces selected by unshareFlags, and then calls f with the child's pid.
+//
+// uidMap and gidMap use the `container-id:host-id:size` format. unshareFlags
+// must not include CLONE_NEWUSER because the user namespace is always created
+// through the clone flags. f may be nil.
+//
+// Before returning, the child is sent SIGKILL and waited for, so the pid
+// passed to f is only meaningful while f runs. An error is returned if f
+// fails or if the child is no longer alive after f returns.
+func UnshareAfterEnterUserns(uidMap, gidMap string, unshareFlags uintptr, f func(pid int) error) (retErr error) {
+	if unshareFlags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER {
+		return fmt.Errorf("unshare flags should not include user namespace")
+	}
+
+	uidMaps, err := parseIDMapping(uidMap)
+	if err != nil {
+		return err
+	}
+
+	gidMaps, err := parseIDMapping(gidMap)
+	if err != nil {
+		return err
+	}
+
+	var pidfd int
+	proc, err := os.StartProcess("/proc/self/exe", []string{"UnshareAfterEnterUserns"}, &os.ProcAttr{
+		Sys: &syscall.SysProcAttr{
+			// clone new user namespace first and then unshare
+			Cloneflags:   unix.CLONE_NEWUSER,
+			Unshareflags: unshareFlags,
+			UidMappings:  uidMaps,
+			GidMappings:  gidMaps,
+			// NOTE: It's a re-exec, but it's not heavy because the subprocess
+			// is in PTRACE_TRACEME mode before performing execve.
+			Ptrace:    true,
+			Pdeathsig: syscall.SIGKILL,
+			PidFD:     &pidfd,
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to start noop process for unshare: %w", err)
+	}
+
+	if pidfd == -1 || !SupportsPidFD() {
+		proc.Kill()
+		proc.Wait()
+		return fmt.Errorf("kernel doesn't support CLONE_PIDFD")
+	}
+
+	// Since go1.23.{0,1} has a double close issue, we should dup it before using it.
+	//
+	// References:
+	// - https://github.com/golang/go/issues/68984
+	// - https://github.com/golang/go/milestone/371
+	if goVer := runtime.Version(); goVer == "go1.23.0" || goVer == "go1.23.1" {
+		dupPidfd, dupErr := unix.FcntlInt(uintptr(pidfd), syscall.F_DUPFD_CLOEXEC, 0)
+		if dupErr != nil {
+			proc.Kill()
+			proc.Wait()
+			// Report dupErr; err is nil here because StartProcess succeeded.
+			return fmt.Errorf("failed to dupfd: %w", dupErr)
+		}
+		pidfd = dupPidfd
+	}
+
+	// Close the pidfd on every return path so it is not leaked. This defer is
+	// registered first so it runs last, after the cleanup below has used it.
+	defer unix.Close(pidfd)
+
+	// Kill and reap the child; ESRCH only means it has already exited.
+	defer func() {
+		derr := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0)
+		if derr != nil {
+			if !errors.Is(derr, unix.ESRCH) {
+				retErr = derr
+			}
+			return
+		}
+		pidfdWaitid(pidfd)
+	}()
+
+	if f != nil {
+		if err := f(proc.Pid); err != nil {
+			return err
+		}
+	}
+
+	// Ensure the child process is still alive. If the err is ESRCH, we
+	// should return error because the pid could be reused. It's safe to
+	// return error and retry.
+	if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil {
+		return fmt.Errorf("failed to ensure child process is alive: %w", err)
+	}
+	return nil
+}
+
+// parseIDMapping parses a single `container-id:host-id:size` mapping into the
+// form expected by syscall.SysProcAttr. Negative values are rejected.
+//
+// TODO: Support multiple mappings in future
+func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
+	parts := strings.Split(mapping, ":")
+	if len(parts) != 3 {
+		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
+	}
+
+	cID, err := strconv.Atoi(parts[0])
+	if err != nil {
+		return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
+	}
+
+	hID, err := strconv.Atoi(parts[1])
+	if err != nil {
+		return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
+	}
+
+	size, err := strconv.Atoi(parts[2])
+	if err != nil {
+		return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
+	}
+
+	if cID < 0 || hID < 0 || size < 0 {
+		return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
+	}
+
+	return []syscall.SysProcIDMap{
+		{
+			ContainerID: cID,
+			HostID:      hID,
+			Size:        size,
+		},
+	}, nil
+}
+
+// pidfdWaitid waits (WEXITED) on the process referred to by pidfd, wrapping
+// the waitid call in IgnoringEINTR.
+func pidfdWaitid(pidfd int) error {
+	return IgnoringEINTR(func() error {
+		return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
+	})
+}
diff --git a/pkg/sys/unshare_linux_test.go b/pkg/sys/unshare_linux_test.go
new file mode 100644
index 000000000..d4486f42f
--- /dev/null
+++ b/pkg/sys/unshare_linux_test.go
@@ -0,0 +1,154 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package sys
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+	"testing"
+
+	kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
+	"github.com/containerd/continuity/testutil"
+	"github.com/stretchr/testify/require"
+)
+
+// TestUnshareAfterEnterUserns needs root and kernel >= 5.10; it checks flag validation, then runs the subtests.
+func TestUnshareAfterEnterUserns(t *testing.T) {
+	testutil.RequiresRoot(t)
+
+	k510 := kernel.KernelVersion{Kernel: 5, Major: 10}
+	ok, err := kernel.GreaterEqualThan(k510)
+	require.NoError(t, err)
+	if !ok {
+		t.Skip("Requires kernel >= 5.10")
+	}
+
+	err = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWUSER|syscall.CLONE_NEWIPC, nil)
+	require.Error(t, err)
+	require.ErrorContains(t, err, "unshare flags should not include user namespace")
+
+	t.Run("should work", testUnshareAfterEnterUsernsShouldWork)
+	t.Run("killpid", testUnshareAfterEnterUsernsKillPid)
+	t.Run("invalid unshare flags", testUnshareAfterEnterUsernsInvalidFlags)
+}
+
+// testUnshareAfterEnterUsernsShouldWork verifies the child's namespaces and ID maps from inside the callback.
+func testUnshareAfterEnterUsernsShouldWork(t *testing.T) {
+	t.Parallel()
+
+	currentNetNs, err := getNamespaceInode(os.Getpid(), "net")
+	require.NoError(t, err)
+
+	currentUserNs, err := getNamespaceInode(os.Getpid(), "user")
+	require.NoError(t, err)
+
+	currentIpcNs, err := getNamespaceInode(os.Getpid(), "ipc")
+	require.NoError(t, err)
+
+	currentPidNs, err := getNamespaceInode(os.Getpid(), "pid")
+	require.NoError(t, err)
+
+	uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
+		netNs, err := getNamespaceInode(pid, "net")
+		require.NoError(t, err)
+		require.NotEqual(t, currentNetNs, netNs)
+
+		userNs, err := getNamespaceInode(pid, "user")
+		require.NoError(t, err)
+		require.NotEqual(t, currentUserNs, userNs)
+
+		ipcNs, err := getNamespaceInode(pid, "ipc")
+		require.NoError(t, err)
+		require.NotEqual(t, currentIpcNs, ipcNs)
+
+		pidNs, err := getNamespaceInode(pid, "pid")
+		require.NoError(t, err)
+		require.Equal(t, currentPidNs, pidNs)
+
+		data, err := os.ReadFile(fmt.Sprintf("/proc/%d/uid_map", pid))
+		require.NoError(t, err)
+		require.Equal(t, "         0       1000         10\n", string(data))
+
+		data, err = os.ReadFile(fmt.Sprintf("/proc/%d/gid_map", pid))
+		require.NoError(t, err)
+		require.Equal(t, "         0       1000         10\n", string(data))
+		return nil
+	})
+	require.NoError(t, uerr)
+}
+
+// testUnshareAfterEnterUsernsKillPid verifies that an error is returned when the child dies during the callback.
+func testUnshareAfterEnterUsernsKillPid(t *testing.T) {
+	t.Parallel()
+
+	uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
+		proc, err := os.FindProcess(pid)
+		if err != nil {
+			return fmt.Errorf("failed to find process: %w", err)
+		}
+
+		if err := proc.Kill(); err != nil {
+			return fmt.Errorf("failed to kill process: %w", err)
+		}
+
+		proc.Wait()
+
+		_, err = os.OpenFile(fmt.Sprintf("/proc/%d/ns/net", pid), os.O_RDONLY, 0600)
+		require.Error(t, err)
+		require.ErrorIs(t, err, os.ErrNotExist)
+		return err
+	})
+	require.Error(t, uerr)
+	require.ErrorIs(t, uerr, os.ErrNotExist)
+
+	uerr = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
+		proc, err := os.FindProcess(pid)
+		if err != nil {
+			return fmt.Errorf("failed to find process: %w", err)
+		}
+
+		if err := proc.Kill(); err != nil {
+			return fmt.Errorf("failed to kill process: %w", err)
+		}
+
+		proc.Wait()
+
+		return nil
+	})
+	require.Error(t, uerr)
+	require.ErrorContains(t, uerr, "failed to ensure child process is alive: no such process")
+}
+
+// testUnshareAfterEnterUsernsInvalidFlags verifies that unsupported unshare flags make the fork/exec fail.
+func testUnshareAfterEnterUsernsInvalidFlags(t *testing.T) {
+	t.Parallel()
+
+	uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_IO, nil)
+	require.Error(t, uerr)
+	require.ErrorContains(t, uerr, "fork/exec /proc/self/exe: invalid argument")
+}
+
+// getNamespaceInode returns the inode number of /proc/<pid>/ns/<typ>.
+func getNamespaceInode(pid int, typ string) (uint64, error) {
+	info, err := os.Stat(fmt.Sprintf("/proc/%d/ns/%s", pid, typ))
+	if err != nil {
+		return 0, err
+	}
+
+	return info.Sys().(*syscall.Stat_t).Ino, nil
+}