Merge pull request #10611 from fuweid/getridof-hall-of-shame
core/mount: use ptrace instead of go:linkname
This commit is contained in:
commit
26d6fd0c3f
@ -19,15 +19,11 @@ package mount
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"runtime"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
|
|
||||||
"github.com/containerd/containerd/v2/pkg/sys"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// TODO: Support multiple mappings in future
|
// TODO: Support multiple mappings in future
|
||||||
@ -106,99 +102,3 @@ func GetUsernsFD(uidmap, gidmap string) (_usernsFD *os.File, _ error) {
|
|||||||
}
|
}
|
||||||
return getUsernsFD(uidMaps, gidMaps)
|
return getUsernsFD(uidMaps, gidMaps)
|
||||||
}
|
}
|
||||||
|
|
||||||
func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_usernsFD *os.File, retErr error) {
|
|
||||||
runtime.LockOSThread()
|
|
||||||
defer runtime.UnlockOSThread()
|
|
||||||
|
|
||||||
pid, pidfd, errno := sys.ForkUserns()
|
|
||||||
if errno != 0 {
|
|
||||||
return nil, errno
|
|
||||||
}
|
|
||||||
|
|
||||||
pidFD := os.NewFile(pidfd, "pidfd")
|
|
||||||
defer func() {
|
|
||||||
unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)
|
|
||||||
|
|
||||||
pidfdWaitid(pidFD)
|
|
||||||
|
|
||||||
pidFD.Close()
|
|
||||||
}()
|
|
||||||
|
|
||||||
// NOTE:
|
|
||||||
//
|
|
||||||
// The usernsFD will hold the userns reference in kernel. Even if the
|
|
||||||
// child process is reaped, the usernsFD is still valid.
|
|
||||||
usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", pid))
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", pid, err)
|
|
||||||
}
|
|
||||||
defer func() {
|
|
||||||
if retErr != nil {
|
|
||||||
usernsFD.Close()
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
uidmapFile, err := os.OpenFile(fmt.Sprintf("/proc/%d/%s", pid, "uid_map"), os.O_WRONLY, 0600)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to open /proc/%d/uid_map: %w", pid, err)
|
|
||||||
}
|
|
||||||
defer uidmapFile.Close()
|
|
||||||
|
|
||||||
gidmapFile, err := os.OpenFile(fmt.Sprintf("/proc/%d/%s", pid, "gid_map"), os.O_WRONLY, 0600)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to open /proc/%d/gid_map: %w", pid, err)
|
|
||||||
}
|
|
||||||
defer gidmapFile.Close()
|
|
||||||
|
|
||||||
testHookKillChildBeforePidfdSendSignal(pid, pidFD)
|
|
||||||
|
|
||||||
// Ensure the child process is still alive. If the err is ESRCH, we
|
|
||||||
// should return error because we can't guarantee the usernsFD and
|
|
||||||
// u[g]idmapFile are valid. It's safe to return error and retry.
|
|
||||||
if err := unix.PidfdSendSignal(int(pidFD.Fd()), 0, nil, 0); err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to ensure child process is alive: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
testHookKillChildAfterPidfdSendSignal(pid, pidFD)
|
|
||||||
|
|
||||||
// NOTE:
|
|
||||||
//
|
|
||||||
// The u[g]id_map file descriptor is still valid if the child process
|
|
||||||
// is reaped.
|
|
||||||
writeMappings := func(f *os.File, idmap []syscall.SysProcIDMap) error {
|
|
||||||
mappings := ""
|
|
||||||
for _, m := range idmap {
|
|
||||||
mappings = fmt.Sprintf("%s%d %d %d\n", mappings, m.ContainerID, m.HostID, m.Size)
|
|
||||||
}
|
|
||||||
|
|
||||||
_, err := f.Write([]byte(mappings))
|
|
||||||
if err1 := f.Close(); err1 != nil && err == nil {
|
|
||||||
err = err1
|
|
||||||
}
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := writeMappings(uidmapFile, uidMaps); err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to write uid_map: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := writeMappings(gidmapFile, gidMaps); err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to write gid_map: %w", err)
|
|
||||||
}
|
|
||||||
return usernsFD, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func pidfdWaitid(pidFD *os.File) error {
|
|
||||||
return sys.IgnoringEINTR(func() error {
|
|
||||||
return unix.Waitid(unix.P_PIDFD, int(pidFD.Fd()), nil, unix.WEXITED, nil)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
testHookLock sync.Mutex
|
|
||||||
|
|
||||||
testHookKillChildBeforePidfdSendSignal = func(_pid uintptr, _pidFD *os.File) {}
|
|
||||||
|
|
||||||
testHookKillChildAfterPidfdSendSignal = func(_pid uintptr, _pidFD *os.File) {}
|
|
||||||
)
|
|
||||||
|
@ -20,15 +20,43 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
|
kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
|
||||||
"github.com/containerd/continuity/testutil"
|
"github.com/containerd/continuity/testutil"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
"golang.org/x/sys/unix"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func BenchmarkBatchRunGetUsernsFD_Concurrent1(b *testing.B) {
|
||||||
|
for range b.N {
|
||||||
|
benchmarkBatchRunGetUsernsFD(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBatchRunGetUsernsFD_Concurrent10(b *testing.B) {
|
||||||
|
for range b.N {
|
||||||
|
benchmarkBatchRunGetUsernsFD(10)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func benchmarkBatchRunGetUsernsFD(n int) {
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
wg.Add(n)
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
fd, err := getUsernsFD(testUIDMaps, testGIDMaps)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
fd.Close()
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
testUIDMaps = []syscall.SysProcIDMap{
|
testUIDMaps = []syscall.SysProcIDMap{
|
||||||
{ContainerID: 1000, HostID: 0, Size: 100},
|
{ContainerID: 1000, HostID: 0, Size: 100},
|
||||||
@ -43,7 +71,7 @@ var (
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestGetUsernsFD(t *testing.T) {
|
func TestIdmappedMount(t *testing.T) {
|
||||||
testutil.RequiresRoot(t)
|
testutil.RequiresRoot(t)
|
||||||
|
|
||||||
k512 := kernel.KernelVersion{Kernel: 5, Major: 12}
|
k512 := kernel.KernelVersion{Kernel: 5, Major: 12}
|
||||||
@ -53,15 +81,12 @@ func TestGetUsernsFD(t *testing.T) {
|
|||||||
t.Skip("GetUsernsFD requires kernel >= 5.12")
|
t.Skip("GetUsernsFD requires kernel >= 5.12")
|
||||||
}
|
}
|
||||||
|
|
||||||
t.Run("basic", testGetUsernsFDBasic)
|
t.Run("GetUsernsFD", testGetUsernsFD)
|
||||||
|
|
||||||
t.Run("when kill child process before write u[g]id maps", testGetUsernsFDKillChildWhenWriteUGIDMaps)
|
|
||||||
|
|
||||||
t.Run("when kill child process after open u[g]id_map file", testGetUsernsFDKillChildAfterOpenUGIDMapFiles)
|
|
||||||
|
|
||||||
|
t.Run("IDMapMount", testIDMapMount)
|
||||||
}
|
}
|
||||||
|
|
||||||
func testGetUsernsFDBasic(t *testing.T) {
|
func testGetUsernsFD(t *testing.T) {
|
||||||
for idx, tc := range []struct {
|
for idx, tc := range []struct {
|
||||||
uidMaps string
|
uidMaps string
|
||||||
gidMaps string
|
gidMaps string
|
||||||
@ -99,108 +124,21 @@ func testGetUsernsFDBasic(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func testGetUsernsFDKillChildWhenWriteUGIDMaps(t *testing.T) {
|
func testIDMapMount(t *testing.T) {
|
||||||
hookFunc := func(reap bool) func(uintptr, *os.File) {
|
usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps)
|
||||||
return func(_pid uintptr, pidFD *os.File) {
|
require.NoError(t, err)
|
||||||
err := unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)
|
defer usernsFD.Close()
|
||||||
require.NoError(t, err)
|
|
||||||
|
|
||||||
if reap {
|
srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps)
|
||||||
pidfdWaitid(pidFD)
|
destDir := t.TempDir()
|
||||||
}
|
defer func() {
|
||||||
}
|
require.NoError(t, UnmountAll(destDir, 0))
|
||||||
}
|
}()
|
||||||
|
|
||||||
for _, tcReap := range []bool{true, false} {
|
err = IDMapMount(srcDir, destDir, int(usernsFD.Fd()))
|
||||||
t.Run(fmt.Sprintf("#reap=%v", tcReap), func(t *testing.T) {
|
usernsFD.Close()
|
||||||
updateTestHookKillForGetUsernsFD(t, nil, hookFunc(tcReap))
|
require.NoError(t, err)
|
||||||
|
checkFunc(destDir)
|
||||||
usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps)
|
|
||||||
require.NoError(t, err)
|
|
||||||
defer usernsFD.Close()
|
|
||||||
|
|
||||||
srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps)
|
|
||||||
destDir := t.TempDir()
|
|
||||||
defer func() {
|
|
||||||
require.NoError(t, UnmountAll(destDir, 0))
|
|
||||||
}()
|
|
||||||
|
|
||||||
err = IDMapMount(srcDir, destDir, int(usernsFD.Fd()))
|
|
||||||
usernsFD.Close()
|
|
||||||
require.NoError(t, err)
|
|
||||||
checkFunc(destDir)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
func testGetUsernsFDKillChildAfterOpenUGIDMapFiles(t *testing.T) {
|
|
||||||
hookFunc := func(reap bool) func(uintptr, *os.File) {
|
|
||||||
return func(_pid uintptr, pidFD *os.File) {
|
|
||||||
err := unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)
|
|
||||||
require.NoError(t, err)
|
|
||||||
|
|
||||||
if reap {
|
|
||||||
pidfdWaitid(pidFD)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tc := range []struct {
|
|
||||||
reap bool
|
|
||||||
expected error
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
reap: false,
|
|
||||||
expected: nil,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
reap: true,
|
|
||||||
expected: syscall.ESRCH,
|
|
||||||
},
|
|
||||||
} {
|
|
||||||
t.Run(fmt.Sprintf("#reap=%v", tc.reap), func(t *testing.T) {
|
|
||||||
updateTestHookKillForGetUsernsFD(t, hookFunc(tc.reap), nil)
|
|
||||||
|
|
||||||
usernsFD, err := getUsernsFD(testUIDMaps, testGIDMaps)
|
|
||||||
if tc.expected != nil {
|
|
||||||
require.Error(t, tc.expected, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
require.NoError(t, err)
|
|
||||||
defer usernsFD.Close()
|
|
||||||
|
|
||||||
srcDir, checkFunc := initIDMappedChecker(t, testUIDMaps, testGIDMaps)
|
|
||||||
destDir := t.TempDir()
|
|
||||||
defer func() {
|
|
||||||
require.NoError(t, UnmountAll(destDir, 0))
|
|
||||||
}()
|
|
||||||
|
|
||||||
err = IDMapMount(srcDir, destDir, int(usernsFD.Fd()))
|
|
||||||
usernsFD.Close()
|
|
||||||
require.NoError(t, err)
|
|
||||||
checkFunc(destDir)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func updateTestHookKillForGetUsernsFD(t *testing.T, newBeforeFunc, newAfterFunc func(uintptr, *os.File)) {
|
|
||||||
testHookLock.Lock()
|
|
||||||
|
|
||||||
oldBefore := testHookKillChildBeforePidfdSendSignal
|
|
||||||
oldAfter := testHookKillChildAfterPidfdSendSignal
|
|
||||||
t.Cleanup(func() {
|
|
||||||
testHookKillChildBeforePidfdSendSignal = oldBefore
|
|
||||||
testHookKillChildAfterPidfdSendSignal = oldAfter
|
|
||||||
testHookLock.Unlock()
|
|
||||||
})
|
|
||||||
if newBeforeFunc != nil {
|
|
||||||
testHookKillChildBeforePidfdSendSignal = newBeforeFunc
|
|
||||||
}
|
|
||||||
if newAfterFunc != nil {
|
|
||||||
testHookKillChildAfterPidfdSendSignal = newAfterFunc
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func initIDMappedChecker(t *testing.T, uidMaps, gidMaps []syscall.SysProcIDMap) (_srcDir string, _verifyFunc func(destDir string)) {
|
func initIDMappedChecker(t *testing.T, uidMaps, gidMaps []syscall.SysProcIDMap) (_srcDir string, _verifyFunc func(destDir string)) {
|
||||||
|
85
core/mount/mount_idmapped_utils_linux.go
Normal file
85
core/mount/mount_idmapped_utils_linux.go
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
//go:build go1.23 && linux
|
||||||
|
|
||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package mount
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"syscall"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd/v2/pkg/sys"
|
||||||
|
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
)
|
||||||
|
|
||||||
|
// getUsernsFD returns pinnable user namespace's file descriptor.
|
||||||
|
//
|
||||||
|
// NOTE: The GO runtime uses pidfd to handle subprocess since go1.23. However,
|
||||||
|
// it has double close issue tracked by [1]. We can't use pidfd directly and
|
||||||
|
// the GO runtime doesn't export interface to show if it's using pidfd or not.
|
||||||
|
// So, we call `sys.SupportsPidFD` first and then use `os.Process` directly.
|
||||||
|
//
|
||||||
|
// [1]: https://github.com/golang/go/issues/68984
|
||||||
|
func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_ *os.File, retErr error) {
|
||||||
|
if !sys.SupportsPidFD() {
|
||||||
|
return nil, fmt.Errorf("failed to prevent pid reused issue because pidfd isn't supported")
|
||||||
|
}
|
||||||
|
|
||||||
|
proc, err := os.StartProcess("/proc/self/exe", []string{"containerd[getUsernsFD]"}, &os.ProcAttr{
|
||||||
|
Sys: &syscall.SysProcAttr{
|
||||||
|
Cloneflags: unix.CLONE_NEWUSER,
|
||||||
|
UidMappings: uidMaps,
|
||||||
|
GidMappings: gidMaps,
|
||||||
|
// NOTE: It's reexec but it's not heavy because subprocess
|
||||||
|
// be in PTRACE_TRACEME mode before performing execve.
|
||||||
|
Ptrace: true,
|
||||||
|
Pdeathsig: syscall.SIGKILL,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to start noop process for unshare: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
proc.Kill()
|
||||||
|
proc.Wait()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// NOTE:
|
||||||
|
//
|
||||||
|
// The usernsFD will hold the userns reference in kernel. Even if the
|
||||||
|
// child process is reaped, the usernsFD is still valid.
|
||||||
|
usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", proc.Pid, err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if retErr != nil {
|
||||||
|
usernsFD.Close()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||||
|
// should return error because we can't guarantee the usernsFD and
|
||||||
|
// u[g]idmapFile are valid. It's safe to return error and retry.
|
||||||
|
if err := proc.Signal(syscall.Signal(0)); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||||
|
}
|
||||||
|
return usernsFD, nil
|
||||||
|
}
|
93
core/mount/mount_idmapped_utils_linux_go122.go
Normal file
93
core/mount/mount_idmapped_utils_linux_go122.go
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
//go:build !go1.23 && linux
|
||||||
|
|
||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package mount
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"syscall"
|
||||||
|
|
||||||
|
"github.com/containerd/containerd/v2/pkg/sys"
|
||||||
|
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
)
|
||||||
|
|
||||||
|
// getUsernsFD returns pinnable user namespace's file descriptor.
|
||||||
|
func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_ *os.File, retErr error) {
|
||||||
|
var pidfd int
|
||||||
|
|
||||||
|
proc, err := os.StartProcess("/proc/self/exe", []string{"containerd[getUsernsFD]"}, &os.ProcAttr{
|
||||||
|
Sys: &syscall.SysProcAttr{
|
||||||
|
Cloneflags: unix.CLONE_NEWUSER,
|
||||||
|
UidMappings: uidMaps,
|
||||||
|
GidMappings: gidMaps,
|
||||||
|
// NOTE: It's reexec but it's not heavy because subprocess
|
||||||
|
// be in PTRACE_TRACEME mode before performing execve.
|
||||||
|
Ptrace: true,
|
||||||
|
Pdeathsig: syscall.SIGKILL,
|
||||||
|
PidFD: &pidfd,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to start noop process for unshare: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if pidfd == -1 || !sys.SupportsPidFD() {
|
||||||
|
proc.Kill()
|
||||||
|
proc.Wait()
|
||||||
|
return nil, fmt.Errorf("failed to prevent pid reused issue because pidfd isn't supported")
|
||||||
|
}
|
||||||
|
|
||||||
|
pidFD := os.NewFile(uintptr(pidfd), "pidfd")
|
||||||
|
defer func() {
|
||||||
|
unix.PidfdSendSignal(int(pidFD.Fd()), unix.SIGKILL, nil, 0)
|
||||||
|
|
||||||
|
pidfdWaitid(pidFD)
|
||||||
|
|
||||||
|
pidFD.Close()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// NOTE:
|
||||||
|
//
|
||||||
|
// The usernsFD will hold the userns reference in kernel. Even if the
|
||||||
|
// child process is reaped, the usernsFD is still valid.
|
||||||
|
usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", proc.Pid, err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if retErr != nil {
|
||||||
|
usernsFD.Close()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||||
|
// should return error because we can't guarantee the usernsFD and
|
||||||
|
// u[g]idmapFile are valid. It's safe to return error and retry.
|
||||||
|
if err := unix.PidfdSendSignal(int(pidFD.Fd()), 0, nil, 0); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||||
|
}
|
||||||
|
return usernsFD, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func pidfdWaitid(pidFD *os.File) error {
|
||||||
|
return sys.IgnoringEINTR(func() error {
|
||||||
|
return unix.Waitid(unix.P_PIDFD, int(pidFD.Fd()), nil, unix.WEXITED, nil)
|
||||||
|
})
|
||||||
|
}
|
85
pkg/sys/pidfd_linux.go
Normal file
85
pkg/sys/pidfd_linux.go
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package sys
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"github.com/containerd/log"
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
pidfdSupported bool
|
||||||
|
pidfdSupportedOnce sync.Once
|
||||||
|
)
|
||||||
|
|
||||||
|
// SupportsPidFD returns true if current kernel supports pidfd.
|
||||||
|
func SupportsPidFD() bool {
|
||||||
|
pidfdSupportedOnce.Do(func() {
|
||||||
|
logger := log.G(context.Background())
|
||||||
|
|
||||||
|
if err := checkPidFD(); err != nil {
|
||||||
|
logger.WithError(err).Error("failed to ensure the kernel supports pidfd")
|
||||||
|
|
||||||
|
pidfdSupported = false
|
||||||
|
return
|
||||||
|
}
|
||||||
|
pidfdSupported = true
|
||||||
|
})
|
||||||
|
return pidfdSupported
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkPidFD() error {
|
||||||
|
// Linux kernel supports pidfd_open(2) since v5.3.
|
||||||
|
//
|
||||||
|
// https://man7.org/linux/man-pages/man2/pidfd_open.2.html
|
||||||
|
pidfd, err := unix.PidfdOpen(os.Getpid(), 0)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to invoke pidfd_open: %w", err)
|
||||||
|
}
|
||||||
|
defer unix.Close(pidfd)
|
||||||
|
|
||||||
|
// Linux kernel supports pidfd_send_signal(2) since v5.1.
|
||||||
|
//
|
||||||
|
// https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html
|
||||||
|
if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil {
|
||||||
|
return fmt.Errorf("failed to invoke pidfd_send_signal: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// The waitid(2) supports P_PIDFD since Linux kernel v5.4.
|
||||||
|
//
|
||||||
|
// https://man7.org/linux/man-pages/man2/waitid.2.html
|
||||||
|
werr := IgnoringEINTR(func() error {
|
||||||
|
return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
|
||||||
|
})
|
||||||
|
|
||||||
|
// The waitid returns ECHILD since current process isn't the child of current process.
|
||||||
|
if !errors.Is(werr, unix.ECHILD) {
|
||||||
|
return fmt.Errorf("failed to invoke waitid with P_PIDFD: wanted error %v, but got %v",
|
||||||
|
unix.ECHILD, werr)
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: The CLONE_PIDFD flag has been supported since Linux kernel v5.2.
|
||||||
|
// So assumption is that if waitid(2) supports P_PIDFD, current kernel
|
||||||
|
// should support CLONE_PIDFD as well.
|
||||||
|
return nil
|
||||||
|
}
|
@ -1,30 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright The containerd Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package sys
|
|
||||||
|
|
||||||
import (
|
|
||||||
_ "unsafe" // required for go:linkname.
|
|
||||||
)
|
|
||||||
|
|
||||||
//go:linkname beforeFork syscall.runtime_BeforeFork
|
|
||||||
func beforeFork()
|
|
||||||
|
|
||||||
//go:linkname afterFork syscall.runtime_AfterFork
|
|
||||||
func afterFork()
|
|
||||||
|
|
||||||
//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
|
|
||||||
func afterForkInChild()
|
|
@ -1,94 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright The containerd Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package sys
|
|
||||||
|
|
||||||
import (
|
|
||||||
"runtime"
|
|
||||||
"syscall"
|
|
||||||
"unsafe"
|
|
||||||
)
|
|
||||||
|
|
||||||
// ForkUserns is to fork child process with user namespace. It returns child
|
|
||||||
// process's pid and pidfd reference to the child process.
|
|
||||||
//
|
|
||||||
// Precondition: The runtime OS thread must be locked, which is GO runtime
|
|
||||||
// requirement.
|
|
||||||
//
|
|
||||||
// Beside this, the child process sets PR_SET_PDEATHSIG with SIGKILL so that
|
|
||||||
// the parent process's OS thread must be locked. Otherwise, the exit event of
|
|
||||||
// parent process's OS thread will send kill signal to the child process,
|
|
||||||
// even if parent process is still running.
|
|
||||||
//
|
|
||||||
//go:norace
|
|
||||||
//go:noinline
|
|
||||||
func ForkUserns() (_pid uintptr, _pidfd uintptr, _ syscall.Errno) {
|
|
||||||
var (
|
|
||||||
pidfd uintptr
|
|
||||||
pid, ppid uintptr
|
|
||||||
err syscall.Errno
|
|
||||||
)
|
|
||||||
|
|
||||||
ppid, _, err = syscall.RawSyscall(uintptr(syscall.SYS_GETPID), 0, 0, 0)
|
|
||||||
if err != 0 {
|
|
||||||
return 0, 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
beforeFork()
|
|
||||||
if runtime.GOARCH == "s390x" {
|
|
||||||
// NOTE:
|
|
||||||
//
|
|
||||||
// On the s390 architectures, the order of the first two
|
|
||||||
// arguments is reversed.
|
|
||||||
//
|
|
||||||
// REF: https://man7.org/linux/man-pages/man2/clone.2.html
|
|
||||||
pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
|
|
||||||
0,
|
|
||||||
uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
|
|
||||||
uintptr(unsafe.Pointer(&pidfd)),
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
|
|
||||||
uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
|
|
||||||
0,
|
|
||||||
uintptr(unsafe.Pointer(&pidfd)),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
if err != 0 || pid != 0 {
|
|
||||||
afterFork()
|
|
||||||
return pid, pidfd, err
|
|
||||||
}
|
|
||||||
afterForkInChild()
|
|
||||||
|
|
||||||
if _, _, err = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 {
|
|
||||||
goto err
|
|
||||||
}
|
|
||||||
|
|
||||||
pid, _, err = syscall.RawSyscall(syscall.SYS_GETPPID, 0, 0, 0)
|
|
||||||
if err != 0 {
|
|
||||||
goto err
|
|
||||||
}
|
|
||||||
|
|
||||||
// exit if re-parent
|
|
||||||
if pid != ppid {
|
|
||||||
goto err
|
|
||||||
}
|
|
||||||
|
|
||||||
_, _, err = syscall.RawSyscall(syscall.SYS_PPOLL, 0, 0, 0)
|
|
||||||
err:
|
|
||||||
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0)
|
|
||||||
panic("unreachable")
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user