core/mount: use ptrace instead of go:linkname
The Go runtime has started to [lock down future uses of linkname][1] since go1.23. In the go source code, containerd project has been marked in the comment, [hall of shame][2]. Well, the go:linkname is used to fork no-op subprocess efficiently. However, since that comment, I would like to use ptrace and remove go:linkname in the whole repository. With go1.22 `go:linkname`: ```bash $ go test -bench=. -benchmem ./ -exec sudo goos: linux goarch: amd64 pkg: github.com/containerd/containerd/v2/core/mount cpu: AMD Ryzen 7 5800H with Radeon Graphics BenchmarkBatchRunGetUsernsFD_Concurrent1-16 2440 533320 ns/op 1145 B/op 43 allocs/op BenchmarkBatchRunGetUsernsFD_Concurrent10-16 342 3661616 ns/op 11562 B/op 421 allocs/op PASS ok github.com/containerd/containerd/v2/core/mount 2.983s ``` With go1.22 `ptrace`: ```bash $ go test -bench=. -benchmem ./ -exec sudo goos: linux goarch: amd64 pkg: github.com/containerd/containerd/v2/core/mount cpu: AMD Ryzen 7 5800H with Radeon Graphics BenchmarkBatchRunGetUsernsFD_Concurrent1-16 1785 739557 ns/op 3948 B/op 68 allocs/op BenchmarkBatchRunGetUsernsFD_Concurrent10-16 328 4024300 ns/op 39601 B/op 671 allocs/op PASS ok github.com/containerd/containerd/v2/core/mount 3.104s ``` With go1.23 `ptrace`: ```bash $ go test -bench=. -benchmem ./ -exec sudo goos: linux goarch: amd64 pkg: github.com/containerd/containerd/v2/core/mount cpu: AMD Ryzen 7 5800H with Radeon Graphics BenchmarkBatchRunGetUsernsFD_Concurrent1-16 1815 723252 ns/op 4220 B/op 69 allocs/op BenchmarkBatchRunGetUsernsFD_Concurrent10-16 319 3957157 ns/op 42351 B/op 682 allocs/op PASS ok github.com/containerd/containerd/v2/core/mount 3.051s ``` Diff: The `ptrace` is slower than `go:linkname` mode. However, it's accepctable. ``` goos: linux goarch: amd64 pkg: github.com/containerd/containerd/v2/core/mount cpu: AMD Ryzen 7 5800H with Radeon Graphics │ go122-golinkname │ go122-ptrace │ go123-ptrace │ │ sec/op │ sec/op vs base │ sec/op vs base │ BatchRunGetUsernsFD_Concurrent1-16 533.3µ ± ∞ ¹ 739.6µ ± ∞ ¹ ~ (p=1.000 n=1) ² 723.3µ ± ∞ ¹ ~ (p=1.000 n=1) ² BatchRunGetUsernsFD_Concurrent10-16 3.662m ± ∞ ¹ 4.024m ± ∞ ¹ ~ (p=1.000 n=1) ² 3.957m ± ∞ ¹ ~ (p=1.000 n=1) ² geomean 1.397m 1.725m +23.45% 1.692m +21.06% ¹ need >= 6 samples for confidence interval at level 0.95 ² need >= 4 samples to detect a difference at alpha level 0.05 │ go122-golinkname │ go122-ptrace │ go123-ptrace │ │ B/op │ B/op vs base │ B/op vs base │ BatchRunGetUsernsFD_Concurrent1-16 1.118Ki ± ∞ ¹ 3.855Ki ± ∞ ¹ ~ (p=1.000 n=1) ² 4.121Ki ± ∞ ¹ ~ (p=1.000 n=1) ² BatchRunGetUsernsFD_Concurrent10-16 11.29Ki ± ∞ ¹ 38.67Ki ± ∞ ¹ ~ (p=1.000 n=1) ² 41.36Ki ± ∞ ¹ ~ (p=1.000 n=1) ² geomean 3.553Ki 12.21Ki +243.65% 13.06Ki +267.43% ¹ need >= 6 samples for confidence interval at level 0.95 ² need >= 4 samples to detect a difference at alpha level 0.05 │ go122-golinkname │ go122-ptrace │ go123-ptrace │ │ allocs/op │ allocs/op vs base │ allocs/op vs base │ BatchRunGetUsernsFD_Concurrent1-16 43.00 ± ∞ ¹ 68.00 ± ∞ ¹ ~ (p=1.000 n=1) ² 69.00 ± ∞ ¹ ~ (p=1.000 n=1) ² BatchRunGetUsernsFD_Concurrent10-16 421.0 ± ∞ ¹ 671.0 ± ∞ ¹ ~ (p=1.000 n=1) ² 682.0 ± ∞ ¹ ~ (p=1.000 n=1) ² geomean 134.5 213.6 +58.76% 216.9 +61.23% ¹ need >= 6 samples for confidence interval at level 0.95 ² need >= 4 samples to detect a difference at alpha level 0.05 ``` [1]: <https://github.com/golang/go/issues/67401> [2]: <https://github.com/golang/go/blob/release-branch.go1.23/src/runtime/proc.go#L4820> Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
parent
bcdf507363
commit
3cd8f9734d
@ -19,15 +19,11 @@ package mount
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/sys"
|
||||
)
|
||||
|
||||
// TODO: Support multiple mappings in future
|
||||
@ -106,99 +102,3 @@ func GetUsernsFD(uidmap, gidmap string) (_usernsFD *os.File, _ error) {
|
||||
}
|
||||
return getUsernsFD(uidMaps, gidMaps)
|
||||
}
|
||||
|
||||
func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_usernsFD *os.File, retErr error) {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
pid, pidfd, errno := sys.ForkUserns()
|
||||
if errno != 0 {
|
||||
return nil, errno
|
||||
}
|
||||
|
||||
pidFD := os.NewFile(pidfd, "pidfd")
|
||||
defer func() {
|
||||
unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)
|
||||
|
||||
pidfdWaitid(pidFD)
|
||||
|
||||
pidFD.Close()
|
||||
}()
|
||||
|
||||
// NOTE:
|
||||
//
|
||||
// The usernsFD will hold the userns reference in kernel. Even if the
|
||||
// child process is reaped, the usernsFD is still valid.
|
||||
usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", pid))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", pid, err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr != nil {
|
||||
usernsFD.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
uidmapFile, err := os.OpenFile(fmt.Sprintf("/proc/%d/%s", pid, "uid_map"), os.O_WRONLY, 0600)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open /proc/%d/uid_map: %w", pid, err)
|
||||
}
|
||||
defer uidmapFile.Close()
|
||||
|
||||
gidmapFile, err := os.OpenFile(fmt.Sprintf("/proc/%d/%s", pid, "gid_map"), os.O_WRONLY, 0600)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open /proc/%d/gid_map: %w", pid, err)
|
||||
}
|
||||
defer gidmapFile.Close()
|
||||
|
||||
testHookKillChildBeforePidfdSendSignal(pid, pidFD)
|
||||
|
||||
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||
// should return error because we can't guarantee the usernsFD and
|
||||
// u[g]idmapFile are valid. It's safe to return error and retry.
|
||||
if err := unix.PidfdSendSignal(int(pidFD.Fd()), 0, nil, 0); err != nil {
|
||||
return nil, fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||
}
|
||||
|
||||
testHookKillChildAfterPidfdSendSignal(pid, pidFD)
|
||||
|
||||
// NOTE:
|
||||
//
|
||||
// The u[g]id_map file descriptor is still valid if the child process
|
||||
// is reaped.
|
||||
writeMappings := func(f *os.File, idmap []syscall.SysProcIDMap) error {
|
||||
mappings := ""
|
||||
for _, m := range idmap {
|
||||
mappings = fmt.Sprintf("%s%d %d %d\n", mappings, m.ContainerID, m.HostID, m.Size)
|
||||
}
|
||||
|
||||
_, err := f.Write([]byte(mappings))
|
||||
if err1 := f.Close(); err1 != nil && err == nil {
|
||||
err = err1
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
if err := writeMappings(uidmapFile, uidMaps); err != nil {
|
||||
return nil, fmt.Errorf("failed to write uid_map: %w", err)
|
||||
}
|
||||
|
||||
if err := writeMappings(gidmapFile, gidMaps); err != nil {
|
||||
return nil, fmt.Errorf("failed to write gid_map: %w", err)
|
||||
}
|
||||
return usernsFD, nil
|
||||
}
|
||||
|
||||
func pidfdWaitid(pidFD *os.File) error {
|
||||
return sys.IgnoringEINTR(func() error {
|
||||
return unix.Waitid(unix.P_PIDFD, int(pidFD.Fd()), nil, unix.WEXITED, nil)
|
||||
})
|
||||
}
|
||||
|
||||
var (
|
||||
testHookLock sync.Mutex
|
||||
|
||||
testHookKillChildBeforePidfdSendSignal = func(_pid uintptr, _pidFD *os.File) {}
|
||||
|
||||
testHookKillChildAfterPidfdSendSignal = func(_pid uintptr, _pidFD *os.File) {}
|
||||
)
|
||||
|
@ -27,7 +27,6 @@ import (
|
||||
kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
|
||||
"github.com/containerd/continuity/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
func BenchmarkBatchRunGetUsernsFD_Concurrent1(b *testing.B) {
|
||||
@ -72,7 +71,7 @@ var (
|
||||
}
|
||||
)
|
||||
|
||||
func TestGetUsernsFD(t *testing.T) {
|
||||
func TestIdmappedMount(t *testing.T) {
|
||||
testutil.RequiresRoot(t)
|
||||
|
||||
k512 := kernel.KernelVersion{Kernel: 5, Major: 12}
|
||||
@ -82,15 +81,12 @@ func TestGetUsernsFD(t *testing.T) {
|
||||
t.Skip("GetUsernsFD requires kernel >= 5.12")
|
||||
}
|
||||
|
||||
t.Run("basic", testGetUsernsFDBasic)
|
||||
|
||||
t.Run("when kill child process before write u[g]id maps", testGetUsernsFDKillChildWhenWriteUGIDMaps)
|
||||
|
||||
t.Run("when kill child process after open u[g]id_map file", testGetUsernsFDKillChildAfterOpenUGIDMapFiles)
|
||||
t.Run("GetUsernsFD", testGetUsernsFD)
|
||||
|
||||
t.Run("IDMapMount", testIDMapMount)
|
||||
}
|
||||
|
||||
func testGetUsernsFDBasic(t *testing.T) {
|
||||
func testGetUsernsFD(t *testing.T) {
|
||||
for idx, tc := range []struct {
|
||||
uidMaps string
|
||||
gidMaps string
|
||||
@ -128,108 +124,21 @@ func testGetUsernsFDBasic(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func testGetUsernsFDKillChildWhenWriteUGIDMaps(t *testing.T) {
|
||||
hookFunc := func(reap bool) func(uintptr, *os.File) {
|
||||
return func(_pid uintptr, pidFD *os.File) {
|
||||
err := unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)
|
||||
require.NoError(t, err)
|
||||
func testIDMapMount(t *testing.T) {
|
||||
usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps)
|
||||
require.NoError(t, err)
|
||||
defer usernsFD.Close()
|
||||
|
||||
if reap {
|
||||
pidfdWaitid(pidFD)
|
||||
}
|
||||
}
|
||||
}
|
||||
srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps)
|
||||
destDir := t.TempDir()
|
||||
defer func() {
|
||||
require.NoError(t, UnmountAll(destDir, 0))
|
||||
}()
|
||||
|
||||
for _, tcReap := range []bool{true, false} {
|
||||
t.Run(fmt.Sprintf("#reap=%v", tcReap), func(t *testing.T) {
|
||||
updateTestHookKillForGetUsernsFD(t, nil, hookFunc(tcReap))
|
||||
|
||||
usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps)
|
||||
require.NoError(t, err)
|
||||
defer usernsFD.Close()
|
||||
|
||||
srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps)
|
||||
destDir := t.TempDir()
|
||||
defer func() {
|
||||
require.NoError(t, UnmountAll(destDir, 0))
|
||||
}()
|
||||
|
||||
err = IDMapMount(srcDir, destDir, int(usernsFD.Fd()))
|
||||
usernsFD.Close()
|
||||
require.NoError(t, err)
|
||||
checkFunc(destDir)
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func testGetUsernsFDKillChildAfterOpenUGIDMapFiles(t *testing.T) {
|
||||
hookFunc := func(reap bool) func(uintptr, *os.File) {
|
||||
return func(_pid uintptr, pidFD *os.File) {
|
||||
err := unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)
|
||||
require.NoError(t, err)
|
||||
|
||||
if reap {
|
||||
pidfdWaitid(pidFD)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, tc := range []struct {
|
||||
reap bool
|
||||
expected error
|
||||
}{
|
||||
{
|
||||
reap: false,
|
||||
expected: nil,
|
||||
},
|
||||
{
|
||||
reap: true,
|
||||
expected: syscall.ESRCH,
|
||||
},
|
||||
} {
|
||||
t.Run(fmt.Sprintf("#reap=%v", tc.reap), func(t *testing.T) {
|
||||
updateTestHookKillForGetUsernsFD(t, hookFunc(tc.reap), nil)
|
||||
|
||||
usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps)
|
||||
if tc.expected != nil {
|
||||
require.Error(t, tc.expected, err)
|
||||
return
|
||||
}
|
||||
|
||||
require.NoError(t, err)
|
||||
defer usernsFD.Close()
|
||||
|
||||
srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps)
|
||||
destDir := t.TempDir()
|
||||
defer func() {
|
||||
require.NoError(t, UnmountAll(destDir, 0))
|
||||
}()
|
||||
|
||||
err = IDMapMount(srcDir, destDir, int(usernsFD.Fd()))
|
||||
usernsFD.Close()
|
||||
require.NoError(t, err)
|
||||
checkFunc(destDir)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func updateTestHookKillForGetUsernsFD(t *testing.T, newBeforeFunc, newAfterFunc func(uintptr, *os.File)) {
|
||||
testHookLock.Lock()
|
||||
|
||||
oldBefore := testHookKillChildBeforePidfdSendSignal
|
||||
oldAfter := testHookKillChildAfterPidfdSendSignal
|
||||
t.Cleanup(func() {
|
||||
testHookKillChildBeforePidfdSendSignal = oldBefore
|
||||
testHookKillChildAfterPidfdSendSignal = oldAfter
|
||||
testHookLock.Unlock()
|
||||
})
|
||||
if newBeforeFunc != nil {
|
||||
testHookKillChildBeforePidfdSendSignal = newBeforeFunc
|
||||
}
|
||||
if newAfterFunc != nil {
|
||||
testHookKillChildAfterPidfdSendSignal = newAfterFunc
|
||||
}
|
||||
err = IDMapMount(srcDir, destDir, int(usernsFD.Fd()))
|
||||
usernsFD.Close()
|
||||
require.NoError(t, err)
|
||||
checkFunc(destDir)
|
||||
}
|
||||
|
||||
func initIDMappedChecker(t *testing.T, uidMaps, gidMaps []syscall.SysProcIDMap) (_srcDir string, _verifyFunc func(destDir string)) {
|
||||
|
85
core/mount/mount_idmapped_utils_linux.go
Normal file
85
core/mount/mount_idmapped_utils_linux.go
Normal file
@ -0,0 +1,85 @@
|
||||
//go:build go1.23 && linux
|
||||
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package mount
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/sys"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// getUsernsFD returns pinnable user namespace's file descriptor.
|
||||
//
|
||||
// NOTE: The GO runtime uses pidfd to handle subprocess since go1.23. However,
|
||||
// it has double close issue tracked by [1]. We can't use pidfd directly and
|
||||
// the GO runtime doesn't export interface to show if it's using pidfd or not.
|
||||
// So, we call `sys.SupportsPidFD` first and then use `os.Process` directly.
|
||||
//
|
||||
// [1]: https://github.com/golang/go/issues/68984
|
||||
func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_ *os.File, retErr error) {
|
||||
if !sys.SupportsPidFD() {
|
||||
return nil, fmt.Errorf("failed to prevent pid reused issue because pidfd isn't supported")
|
||||
}
|
||||
|
||||
proc, err := os.StartProcess("/proc/self/exe", []string{"containerd[getUsernsFD]"}, &os.ProcAttr{
|
||||
Sys: &syscall.SysProcAttr{
|
||||
Cloneflags: unix.CLONE_NEWUSER,
|
||||
UidMappings: uidMaps,
|
||||
GidMappings: gidMaps,
|
||||
// NOTE: It's reexec but it's not heavy because subprocess
|
||||
// be in PTRACE_TRACEME mode before performing execve.
|
||||
Ptrace: true,
|
||||
Pdeathsig: syscall.SIGKILL,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to start noop process for unshare: %w", err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
proc.Kill()
|
||||
proc.Wait()
|
||||
}()
|
||||
|
||||
// NOTE:
|
||||
//
|
||||
// The usernsFD will hold the userns reference in kernel. Even if the
|
||||
// child process is reaped, the usernsFD is still valid.
|
||||
usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", proc.Pid, err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr != nil {
|
||||
usernsFD.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||
// should return error because we can't guarantee the usernsFD and
|
||||
// u[g]idmapFile are valid. It's safe to return error and retry.
|
||||
if err := proc.Signal(syscall.Signal(0)); err != nil {
|
||||
return nil, fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||
}
|
||||
return usernsFD, nil
|
||||
}
|
93
core/mount/mount_idmapped_utils_linux_go122.go
Normal file
93
core/mount/mount_idmapped_utils_linux_go122.go
Normal file
@ -0,0 +1,93 @@
|
||||
//go:build !go1.23 && linux
|
||||
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package mount
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/sys"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// getUsernsFD returns pinnable user namespace's file descriptor.
|
||||
func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_ *os.File, retErr error) {
|
||||
var pidfd int
|
||||
|
||||
proc, err := os.StartProcess("/proc/self/exe", []string{"containerd[getUsernsFD]"}, &os.ProcAttr{
|
||||
Sys: &syscall.SysProcAttr{
|
||||
Cloneflags: unix.CLONE_NEWUSER,
|
||||
UidMappings: uidMaps,
|
||||
GidMappings: gidMaps,
|
||||
// NOTE: It's reexec but it's not heavy because subprocess
|
||||
// be in PTRACE_TRACEME mode before performing execve.
|
||||
Ptrace: true,
|
||||
Pdeathsig: syscall.SIGKILL,
|
||||
PidFD: &pidfd,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to start noop process for unshare: %w", err)
|
||||
}
|
||||
|
||||
if pidfd == -1 || !sys.SupportsPidFD() {
|
||||
proc.Kill()
|
||||
proc.Wait()
|
||||
return nil, fmt.Errorf("failed to prevent pid reused issue because pidfd isn't supported")
|
||||
}
|
||||
|
||||
pidFD := os.NewFile(uintptr(pidfd), "pidfd")
|
||||
defer func() {
|
||||
unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)
|
||||
|
||||
pidfdWaitid(pidFD)
|
||||
|
||||
pidFD.Close()
|
||||
}()
|
||||
|
||||
// NOTE:
|
||||
//
|
||||
// The usernsFD will hold the userns reference in kernel. Even if the
|
||||
// child process is reaped, the usernsFD is still valid.
|
||||
usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", proc.Pid, err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr != nil {
|
||||
usernsFD.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||
// should return error because we can't guarantee the usernsFD and
|
||||
// u[g]idmapFile are valid. It's safe to return error and retry.
|
||||
if err := unix.PidfdSendSignal(int(pidFD.Fd()), 0, nil, 0); err != nil {
|
||||
return nil, fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||
}
|
||||
return usernsFD, nil
|
||||
}
|
||||
|
||||
func pidfdWaitid(pidFD *os.File) error {
|
||||
return sys.IgnoringEINTR(func() error {
|
||||
return unix.Waitid(unix.P_PIDFD, int(pidFD.Fd()), nil, unix.WEXITED, nil)
|
||||
})
|
||||
}
|
85
pkg/sys/pidfd_linux.go
Normal file
85
pkg/sys/pidfd_linux.go
Normal file
@ -0,0 +1,85 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
"github.com/containerd/log"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
var (
|
||||
pidfdSupported bool
|
||||
pidfdSupportedOnce sync.Once
|
||||
)
|
||||
|
||||
// SupportsPidFD returns true if current kernel supports pidfd.
|
||||
func SupportsPidFD() bool {
|
||||
pidfdSupportedOnce.Do(func() {
|
||||
logger := log.G(context.Background())
|
||||
|
||||
if err := checkPidFD(); err != nil {
|
||||
logger.WithError(err).Error("failed to ensure the kernel supports pidfd")
|
||||
|
||||
pidfdSupported = false
|
||||
return
|
||||
}
|
||||
pidfdSupported = true
|
||||
})
|
||||
return pidfdSupported
|
||||
}
|
||||
|
||||
func checkPidFD() error {
|
||||
// Linux kernel supports pidfd_open(2) since v5.3.
|
||||
//
|
||||
// https://man7.org/linux/man-pages/man2/pidfd_open.2.html
|
||||
pidfd, err := unix.PidfdOpen(os.Getpid(), 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to invoke pidfd_open: %w", err)
|
||||
}
|
||||
defer unix.Close(pidfd)
|
||||
|
||||
// Linux kernel supports pidfd_send_signal(2) since v5.1.
|
||||
//
|
||||
// https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html
|
||||
if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil {
|
||||
return fmt.Errorf("failed to invoke pidfd_send_signal: %w", err)
|
||||
}
|
||||
|
||||
// The waitid(2) supports P_PIDFD since Linux kernel v5.4.
|
||||
//
|
||||
// https://man7.org/linux/man-pages/man2/waitid.2.html
|
||||
werr := IgnoringEINTR(func() error {
|
||||
return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
|
||||
})
|
||||
|
||||
// The waitid returns ECHILD since current process isn't the child of current process.
|
||||
if !errors.Is(werr, unix.ECHILD) {
|
||||
return fmt.Errorf("failed to invoke waitid with P_PIDFD: wanted error %v, but got %v",
|
||||
unix.ECHILD, werr)
|
||||
}
|
||||
|
||||
// NOTE: The CLONE_PIDFD flag has been supported since Linux kernel v5.2.
|
||||
// So assumption is that if waitid(2) supports P_PIDFD, current kernel
|
||||
// should support CLONE_PIDFD as well.
|
||||
return nil
|
||||
}
|
@ -1,30 +0,0 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
_ "unsafe" // required for go:linkname.
|
||||
)
|
||||
|
||||
//go:linkname beforeFork syscall.runtime_BeforeFork
|
||||
func beforeFork()
|
||||
|
||||
//go:linkname afterFork syscall.runtime_AfterFork
|
||||
func afterFork()
|
||||
|
||||
//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
|
||||
func afterForkInChild()
|
@ -1,94 +0,0 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// ForkUserns is to fork child process with user namespace. It returns child
|
||||
// process's pid and pidfd reference to the child process.
|
||||
//
|
||||
// Precondition: The runtime OS thread must be locked, which is GO runtime
|
||||
// requirement.
|
||||
//
|
||||
// Beside this, the child process sets PR_SET_PDEATHSIG with SIGKILL so that
|
||||
// the parent process's OS thread must be locked. Otherwise, the exit event of
|
||||
// parent process's OS thread will send kill signal to the child process,
|
||||
// even if parent process is still running.
|
||||
//
|
||||
//go:norace
|
||||
//go:noinline
|
||||
func ForkUserns() (_pid uintptr, _pidfd uintptr, _ syscall.Errno) {
|
||||
var (
|
||||
pidfd uintptr
|
||||
pid, ppid uintptr
|
||||
err syscall.Errno
|
||||
)
|
||||
|
||||
ppid, _, err = syscall.RawSyscall(uintptr(syscall.SYS_GETPID), 0, 0, 0)
|
||||
if err != 0 {
|
||||
return 0, 0, err
|
||||
}
|
||||
|
||||
beforeFork()
|
||||
if runtime.GOARCH == "s390x" {
|
||||
// NOTE:
|
||||
//
|
||||
// On the s390 architectures, the order of the first two
|
||||
// arguments is reversed.
|
||||
//
|
||||
// REF: https://man7.org/linux/man-pages/man2/clone.2.html
|
||||
pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
|
||||
0,
|
||||
uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
|
||||
uintptr(unsafe.Pointer(&pidfd)),
|
||||
)
|
||||
} else {
|
||||
pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
|
||||
uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
|
||||
0,
|
||||
uintptr(unsafe.Pointer(&pidfd)),
|
||||
)
|
||||
}
|
||||
if err != 0 || pid != 0 {
|
||||
afterFork()
|
||||
return pid, pidfd, err
|
||||
}
|
||||
afterForkInChild()
|
||||
|
||||
if _, _, err = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 {
|
||||
goto err
|
||||
}
|
||||
|
||||
pid, _, err = syscall.RawSyscall(syscall.SYS_GETPPID, 0, 0, 0)
|
||||
if err != 0 {
|
||||
goto err
|
||||
}
|
||||
|
||||
// exit if re-parent
|
||||
if pid != ppid {
|
||||
goto err
|
||||
}
|
||||
|
||||
_, _, err = syscall.RawSyscall(syscall.SYS_PPOLL, 0, 0, 0)
|
||||
err:
|
||||
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0)
|
||||
panic("unreachable")
|
||||
}
|
Loading…
Reference in New Issue
Block a user