diff --git a/core/mount/mount_idmapped_linux.go b/core/mount/mount_idmapped_linux.go index 4fee53e98..d929157d7 100644 --- a/core/mount/mount_idmapped_linux.go +++ b/core/mount/mount_idmapped_linux.go @@ -19,15 +19,11 @@ package mount import ( "fmt" "os" - "runtime" "strconv" "strings" - "sync" "syscall" "golang.org/x/sys/unix" - - "github.com/containerd/containerd/v2/pkg/sys" ) // TODO: Support multiple mappings in future @@ -106,99 +102,3 @@ func GetUsernsFD(uidmap, gidmap string) (_usernsFD *os.File, _ error) { } return getUsernsFD(uidMaps, gidMaps) } - -func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_usernsFD *os.File, retErr error) { - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - pid, pidfd, errno := sys.ForkUserns() - if errno != 0 { - return nil, errno - } - - pidFD := os.NewFile(pidfd, "pidfd") - defer func() { - unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0) - - pidfdWaitid(pidFD) - - pidFD.Close() - }() - - // NOTE: - // - // The usernsFD will hold the userns reference in kernel. Even if the - // child process is reaped, the usernsFD is still valid. - usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", pid)) - if err != nil { - return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", pid, err) - } - defer func() { - if retErr != nil { - usernsFD.Close() - } - }() - - uidmapFile, err := os.OpenFile(fmt.Sprintf("/proc/%d/%s", pid, "uid_map"), os.O_WRONLY, 0600) - if err != nil { - return nil, fmt.Errorf("failed to open /proc/%d/uid_map: %w", pid, err) - } - defer uidmapFile.Close() - - gidmapFile, err := os.OpenFile(fmt.Sprintf("/proc/%d/%s", pid, "gid_map"), os.O_WRONLY, 0600) - if err != nil { - return nil, fmt.Errorf("failed to open /proc/%d/gid_map: %w", pid, err) - } - defer gidmapFile.Close() - - testHookKillChildBeforePidfdSendSignal(pid, pidFD) - - // Ensure the child process is still alive. If the err is ESRCH, we - // should return error because we can't guarantee the usernsFD and - // u[g]idmapFile are valid. It's safe to return error and retry. - if err := unix.PidfdSendSignal(int(pidFD.Fd()), 0, nil, 0); err != nil { - return nil, fmt.Errorf("failed to ensure child process is alive: %w", err) - } - - testHookKillChildAfterPidfdSendSignal(pid, pidFD) - - // NOTE: - // - // The u[g]id_map file descriptor is still valid if the child process - // is reaped. - writeMappings := func(f *os.File, idmap []syscall.SysProcIDMap) error { - mappings := "" - for _, m := range idmap { - mappings = fmt.Sprintf("%s%d %d %d\n", mappings, m.ContainerID, m.HostID, m.Size) - } - - _, err := f.Write([]byte(mappings)) - if err1 := f.Close(); err1 != nil && err == nil { - err = err1 - } - return err - } - - if err := writeMappings(uidmapFile, uidMaps); err != nil { - return nil, fmt.Errorf("failed to write uid_map: %w", err) - } - - if err := writeMappings(gidmapFile, gidMaps); err != nil { - return nil, fmt.Errorf("failed to write gid_map: %w", err) - } - return usernsFD, nil -} - -func pidfdWaitid(pidFD *os.File) error { - return sys.IgnoringEINTR(func() error { - return unix.Waitid(unix.P_PIDFD, int(pidFD.Fd()), nil, unix.WEXITED, nil) - }) -} - -var ( - testHookLock sync.Mutex - - testHookKillChildBeforePidfdSendSignal = func(_pid uintptr, _pidFD *os.File) {} - - testHookKillChildAfterPidfdSendSignal = func(_pid uintptr, _pidFD *os.File) {} -) diff --git a/core/mount/mount_idmapped_linux_test.go b/core/mount/mount_idmapped_linux_test.go index 1dd36d2e3..90e3a61e8 100644 --- a/core/mount/mount_idmapped_linux_test.go +++ b/core/mount/mount_idmapped_linux_test.go @@ -20,15 +20,43 @@ import ( "fmt" "os" "path/filepath" + "sync" "syscall" "testing" kernel "github.com/containerd/containerd/v2/pkg/kernelversion" "github.com/containerd/continuity/testutil" "github.com/stretchr/testify/require" - "golang.org/x/sys/unix" ) +func BenchmarkBatchRunGetUsernsFD_Concurrent1(b *testing.B) { + for range b.N { + benchmarkBatchRunGetUsernsFD(1) + } +} + +func BenchmarkBatchRunGetUsernsFD_Concurrent10(b *testing.B) { + for range b.N { + benchmarkBatchRunGetUsernsFD(10) + } +} + +func benchmarkBatchRunGetUsernsFD(n int) { + var wg sync.WaitGroup + wg.Add(n) + for i := 0; i < n; i++ { + go func() { + defer wg.Done() + fd, err := getUsernsFD(testUIDMaps, testGIDMaps) + if err != nil { + panic(err) + } + fd.Close() + }() + } + wg.Wait() +} + var ( testUIDMaps = []syscall.SysProcIDMap{ {ContainerID: 1000, HostID: 0, Size: 100}, @@ -43,7 +71,7 @@ var ( } ) -func TestGetUsernsFD(t *testing.T) { +func TestIdmappedMount(t *testing.T) { testutil.RequiresRoot(t) k512 := kernel.KernelVersion{Kernel: 5, Major: 12} @@ -53,15 +81,12 @@ func TestGetUsernsFD(t *testing.T) { t.Skip("GetUsernsFD requires kernel >= 5.12") } - t.Run("basic", testGetUsernsFDBasic) - - t.Run("when kill child process before write u[g]id maps", testGetUsernsFDKillChildWhenWriteUGIDMaps) - - t.Run("when kill child process after open u[g]id_map file", testGetUsernsFDKillChildAfterOpenUGIDMapFiles) + t.Run("GetUsernsFD", testGetUsernsFD) + t.Run("IDMapMount", testIDMapMount) } -func testGetUsernsFDBasic(t *testing.T) { +func testGetUsernsFD(t *testing.T) { for idx, tc := range []struct { uidMaps string gidMaps string @@ -99,108 +124,21 @@ func testGetUsernsFDBasic(t *testing.T) { } } -func testGetUsernsFDKillChildWhenWriteUGIDMaps(t *testing.T) { - hookFunc := func(reap bool) func(uintptr, *os.File) { - return func(_pid uintptr, pidFD *os.File) { - err := unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0) - require.NoError(t, err) +func testIDMapMount(t *testing.T) { + usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps) + require.NoError(t, err) + defer usernsFD.Close() - if reap { - pidfdWaitid(pidFD) - } - } - } + srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps) + destDir := t.TempDir() + defer func() { + require.NoError(t, UnmountAll(destDir, 0)) + }() - for _, tcReap := range []bool{true, false} { - t.Run(fmt.Sprintf("#reap=%v", tcReap), func(t *testing.T) { - updateTestHookKillForGetUsernsFD(t, nil, hookFunc(tcReap)) - - usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps) - require.NoError(t, err) - defer usernsFD.Close() - - srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps) - destDir := t.TempDir() - defer func() { - require.NoError(t, UnmountAll(destDir, 0)) - }() - - err = IDMapMount(srcDir, destDir, int(usernsFD.Fd())) - usernsFD.Close() - require.NoError(t, err) - checkFunc(destDir) - }) - } - -} - -func testGetUsernsFDKillChildAfterOpenUGIDMapFiles(t *testing.T) { - hookFunc := func(reap bool) func(uintptr, *os.File) { - return func(_pid uintptr, pidFD *os.File) { - err := unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0) - require.NoError(t, err) - - if reap { - pidfdWaitid(pidFD) - } - } - } - - for _, tc := range []struct { - reap bool - expected error - }{ - { - reap: false, - expected: nil, - }, - { - reap: true, - expected: syscall.ESRCH, - }, - } { - t.Run(fmt.Sprintf("#reap=%v", tc.reap), func(t *testing.T) { - updateTestHookKillForGetUsernsFD(t, hookFunc(tc.reap), nil) - - usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps) - if tc.expected != nil { - require.Error(t, tc.expected, err) - return - } - - require.NoError(t, err) - defer usernsFD.Close() - - srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps) - destDir := t.TempDir() - defer func() { - require.NoError(t, UnmountAll(destDir, 0)) - }() - - err = IDMapMount(srcDir, destDir, int(usernsFD.Fd())) - usernsFD.Close() - require.NoError(t, err) - checkFunc(destDir) - }) - } -} - -func updateTestHookKillForGetUsernsFD(t *testing.T, newBeforeFunc, newAfterFunc func(uintptr, *os.File)) { - testHookLock.Lock() - - oldBefore := testHookKillChildBeforePidfdSendSignal - oldAfter := testHookKillChildAfterPidfdSendSignal - t.Cleanup(func() { - testHookKillChildBeforePidfdSendSignal = oldBefore - testHookKillChildAfterPidfdSendSignal = oldAfter - testHookLock.Unlock() - }) - if newBeforeFunc != nil { - testHookKillChildBeforePidfdSendSignal = newBeforeFunc - } - if newAfterFunc != nil { - testHookKillChildAfterPidfdSendSignal = newAfterFunc - } + err = IDMapMount(srcDir, destDir, int(usernsFD.Fd())) + usernsFD.Close() + require.NoError(t, err) + checkFunc(destDir) } func initIDMappedChecker(t *testing.T, uidMaps, gidMaps []syscall.SysProcIDMap) (_srcDir string, _verifyFunc func(destDir string)) { diff --git a/core/mount/mount_idmapped_utils_linux.go b/core/mount/mount_idmapped_utils_linux.go new file mode 100644 index 000000000..373285be8 --- /dev/null +++ b/core/mount/mount_idmapped_utils_linux.go @@ -0,0 +1,85 @@ +//go:build go1.23 && linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package mount + +import ( + "fmt" + "os" + "syscall" + + "github.com/containerd/containerd/v2/pkg/sys" + + "golang.org/x/sys/unix" +) + +// getUsernsFD returns pinnable user namespace's file descriptor. +// +// NOTE: The GO runtime uses pidfd to handle subprocess since go1.23. However, +// it has double close issue tracked by [1]. We can't use pidfd directly and +// the GO runtime doesn't export interface to show if it's using pidfd or not. +// So, we call `sys.SupportsPidFD` first and then use `os.Process` directly. +// +// [1]: https://github.com/golang/go/issues/68984 +func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_ *os.File, retErr error) { + if !sys.SupportsPidFD() { + return nil, fmt.Errorf("failed to prevent pid reused issue because pidfd isn't supported") + } + + proc, err := os.StartProcess("/proc/self/exe", []string{"containerd[getUsernsFD]"}, &os.ProcAttr{ + Sys: &syscall.SysProcAttr{ + Cloneflags: unix.CLONE_NEWUSER, + UidMappings: uidMaps, + GidMappings: gidMaps, + // NOTE: It's reexec but it's not heavy because subprocess + // be in PTRACE_TRACEME mode before performing execve. + Ptrace: true, + Pdeathsig: syscall.SIGKILL, + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to start noop process for unshare: %w", err) + } + + defer func() { + proc.Kill() + proc.Wait() + }() + + // NOTE: + // + // The usernsFD will hold the userns reference in kernel. Even if the + // child process is reaped, the usernsFD is still valid. + usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid)) + if err != nil { + return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", proc.Pid, err) + } + defer func() { + if retErr != nil { + usernsFD.Close() + } + }() + + // Ensure the child process is still alive. If the err is ESRCH, we + // should return error because we can't guarantee the usernsFD and + // u[g]idmapFile are valid. It's safe to return error and retry. + if err := proc.Signal(syscall.Signal(0)); err != nil { + return nil, fmt.Errorf("failed to ensure child process is alive: %w", err) + } + return usernsFD, nil +} diff --git a/core/mount/mount_idmapped_utils_linux_go122.go b/core/mount/mount_idmapped_utils_linux_go122.go new file mode 100644 index 000000000..669f395d2 --- /dev/null +++ b/core/mount/mount_idmapped_utils_linux_go122.go @@ -0,0 +1,93 @@ +//go:build !go1.23 && linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package mount + +import ( + "fmt" + "os" + "syscall" + + "github.com/containerd/containerd/v2/pkg/sys" + + "golang.org/x/sys/unix" +) + +// getUsernsFD returns pinnable user namespace's file descriptor. +func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_ *os.File, retErr error) { + var pidfd int + + proc, err := os.StartProcess("/proc/self/exe", []string{"containerd[getUsernsFD]"}, &os.ProcAttr{ + Sys: &syscall.SysProcAttr{ + Cloneflags: unix.CLONE_NEWUSER, + UidMappings: uidMaps, + GidMappings: gidMaps, + // NOTE: It's reexec but it's not heavy because subprocess + // be in PTRACE_TRACEME mode before performing execve. + Ptrace: true, + Pdeathsig: syscall.SIGKILL, + PidFD: &pidfd, + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to start noop process for unshare: %w", err) + } + + if pidfd == -1 || !sys.SupportsPidFD() { + proc.Kill() + proc.Wait() + return nil, fmt.Errorf("failed to prevent pid reused issue because pidfd isn't supported") + } + + pidFD := os.NewFile(uintptr(pidfd), "pidfd") + defer func() { + unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0) + + pidfdWaitid(pidFD) + + pidFD.Close() + }() + + // NOTE: + // + // The usernsFD will hold the userns reference in kernel. Even if the + // child process is reaped, the usernsFD is still valid. + usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid)) + if err != nil { + return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", proc.Pid, err) + } + defer func() { + if retErr != nil { + usernsFD.Close() + } + }() + + // Ensure the child process is still alive. If the err is ESRCH, we + // should return error because we can't guarantee the usernsFD and + // u[g]idmapFile are valid. It's safe to return error and retry. + if err := unix.PidfdSendSignal(int(pidFD.Fd()), 0, nil, 0); err != nil { + return nil, fmt.Errorf("failed to ensure child process is alive: %w", err) + } + return usernsFD, nil +} + +func pidfdWaitid(pidFD *os.File) error { + return sys.IgnoringEINTR(func() error { + return unix.Waitid(unix.P_PIDFD, int(pidFD.Fd()), nil, unix.WEXITED, nil) + }) +} diff --git a/pkg/sys/pidfd_linux.go b/pkg/sys/pidfd_linux.go new file mode 100644 index 000000000..b6f82d793 --- /dev/null +++ b/pkg/sys/pidfd_linux.go @@ -0,0 +1,85 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sys + +import ( + "context" + "errors" + "fmt" + "os" + "sync" + + "github.com/containerd/log" + "golang.org/x/sys/unix" +) + +var ( + pidfdSupported bool + pidfdSupportedOnce sync.Once +) + +// SupportsPidFD returns true if current kernel supports pidfd. +func SupportsPidFD() bool { + pidfdSupportedOnce.Do(func() { + logger := log.G(context.Background()) + + if err := checkPidFD(); err != nil { + logger.WithError(err).Error("failed to ensure the kernel supports pidfd") + + pidfdSupported = false + return + } + pidfdSupported = true + }) + return pidfdSupported +} + +func checkPidFD() error { + // Linux kernel supports pidfd_open(2) since v5.3. + // + // https://man7.org/linux/man-pages/man2/pidfd_open.2.html + pidfd, err := unix.PidfdOpen(os.Getpid(), 0) + if err != nil { + return fmt.Errorf("failed to invoke pidfd_open: %w", err) + } + defer unix.Close(pidfd) + + // Linux kernel supports pidfd_send_signal(2) since v5.1. + // + // https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html + if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil { + return fmt.Errorf("failed to invoke pidfd_send_signal: %w", err) + } + + // The waitid(2) supports P_PIDFD since Linux kernel v5.4. + // + // https://man7.org/linux/man-pages/man2/waitid.2.html + werr := IgnoringEINTR(func() error { + return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil) + }) + + // The waitid returns ECHILD since current process isn't the child of current process. + if !errors.Is(werr, unix.ECHILD) { + return fmt.Errorf("failed to invoke waitid with P_PIDFD: wanted error %v, but got %v", + unix.ECHILD, werr) + } + + // NOTE: The CLONE_PIDFD flag has been supported since Linux kernel v5.2. + // So assumption is that if waitid(2) supports P_PIDFD, current kernel + // should support CLONE_PIDFD as well. + return nil +} diff --git a/pkg/sys/subprocess_unsafe_linux.go b/pkg/sys/subprocess_unsafe_linux.go deleted file mode 100644 index 6e40a9c7d..000000000 --- a/pkg/sys/subprocess_unsafe_linux.go +++ /dev/null @@ -1,30 +0,0 @@ -/* - Copyright The containerd Authors. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package sys - -import ( - _ "unsafe" // required for go:linkname. -) - -//go:linkname beforeFork syscall.runtime_BeforeFork -func beforeFork() - -//go:linkname afterFork syscall.runtime_AfterFork -func afterFork() - -//go:linkname afterForkInChild syscall.runtime_AfterForkInChild -func afterForkInChild() diff --git a/pkg/sys/userns_unsafe_linux.go b/pkg/sys/userns_unsafe_linux.go deleted file mode 100644 index 4ebe46c32..000000000 --- a/pkg/sys/userns_unsafe_linux.go +++ /dev/null @@ -1,94 +0,0 @@ -/* - Copyright The containerd Authors. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package sys - -import ( - "runtime" - "syscall" - "unsafe" -) - -// ForkUserns is to fork child process with user namespace. It returns child -// process's pid and pidfd reference to the child process. -// -// Precondition: The runtime OS thread must be locked, which is GO runtime -// requirement. -// -// Beside this, the child process sets PR_SET_PDEATHSIG with SIGKILL so that -// the parent process's OS thread must be locked. Otherwise, the exit event of -// parent process's OS thread will send kill signal to the child process, -// even if parent process is still running. -// -//go:norace -//go:noinline -func ForkUserns() (_pid uintptr, _pidfd uintptr, _ syscall.Errno) { - var ( - pidfd uintptr - pid, ppid uintptr - err syscall.Errno - ) - - ppid, _, err = syscall.RawSyscall(uintptr(syscall.SYS_GETPID), 0, 0, 0) - if err != 0 { - return 0, 0, err - } - - beforeFork() - if runtime.GOARCH == "s390x" { - // NOTE: - // - // On the s390 architectures, the order of the first two - // arguments is reversed. - // - // REF: https://man7.org/linux/man-pages/man2/clone.2.html - pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE, - 0, - uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD), - uintptr(unsafe.Pointer(&pidfd)), - ) - } else { - pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE, - uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD), - 0, - uintptr(unsafe.Pointer(&pidfd)), - ) - } - if err != 0 || pid != 0 { - afterFork() - return pid, pidfd, err - } - afterForkInChild() - - if _, _, err = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 { - goto err - } - - pid, _, err = syscall.RawSyscall(syscall.SYS_GETPPID, 0, 0, 0) - if err != 0 { - goto err - } - - // exit if re-parent - if pid != ppid { - goto err - } - - _, _, err = syscall.RawSyscall(syscall.SYS_PPOLL, 0, 0, 0) -err: - syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0) - panic("unreachable") -}