Merge pull request #10607 from fuweid/pin-userns
internal/cri: simplify netns setup with pinned userns
This commit is contained in:
38
pkg/sys/namespace_linux.go
Normal file
38
pkg/sys/namespace_linux.go
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// GetUsernsForNamespace returns a file descriptor that refers to the owning
|
||||
// user namespace for the namespace referred to by fd.
|
||||
//
|
||||
// REF: https://man7.org/linux/man-pages/man2/ioctl_ns.2.html
|
||||
func GetUsernsForNamespace(fd uintptr) (*os.File, error) {
|
||||
fd, _, errno := unix.Syscall(syscall.SYS_IOCTL, fd, uintptr(unix.NS_GET_USERNS), 0)
|
||||
if errno != 0 {
|
||||
return nil, fmt.Errorf("failed to get user namespace fd: %w", errno)
|
||||
}
|
||||
|
||||
return os.NewFile(fd, fmt.Sprintf("/proc/%d/fd/%d", os.Getpid(), fd)), nil
|
||||
}
|
||||
106
pkg/sys/namespace_linux_test.go
Normal file
106
pkg/sys/namespace_linux_test.go
Normal file
@@ -0,0 +1,106 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
|
||||
"github.com/containerd/continuity/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
func TestGetUsernsForNamespace(t *testing.T) {
|
||||
testutil.RequiresRoot(t)
|
||||
|
||||
t.Parallel()
|
||||
|
||||
k409 := kernel.KernelVersion{Kernel: 4, Major: 9}
|
||||
ok, err := kernel.GreaterEqualThan(k409)
|
||||
require.NoError(t, err)
|
||||
if !ok {
|
||||
t.Skip("Requires kernel >= 4.9")
|
||||
}
|
||||
|
||||
tmpDir := t.TempDir()
|
||||
|
||||
f, err := os.CreateTemp(tmpDir, "netns")
|
||||
require.NoError(t, err)
|
||||
|
||||
netnsPath := f.Name()
|
||||
f.Close()
|
||||
|
||||
defer testutil.Unmount(t, netnsPath)
|
||||
|
||||
currentUsernsIno, err := getNamespaceInode(os.Getpid(), "user")
|
||||
require.NoError(t, err)
|
||||
|
||||
usernsIno := uint64(0)
|
||||
uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWNET, func(pid int) error {
|
||||
err := unix.Mount(
|
||||
fmt.Sprintf("/proc/%d/ns/net", pid),
|
||||
netnsPath,
|
||||
"",
|
||||
unix.MS_BIND|unix.MS_RDONLY,
|
||||
"",
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
usernsIno, err = getNamespaceInode(pid, "user")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
})
|
||||
require.NoError(t, uerr)
|
||||
|
||||
require.NotEqual(t, currentUsernsIno, usernsIno)
|
||||
t.Logf("Current user namespace [%d], new user namespace [%d]", currentUsernsIno, usernsIno)
|
||||
|
||||
netnsFd, err := os.Open(netnsPath)
|
||||
require.NoError(t, err)
|
||||
defer netnsFd.Close()
|
||||
|
||||
usernsFd, err := GetUsernsForNamespace(netnsFd.Fd())
|
||||
require.NoError(t, err)
|
||||
defer usernsFd.Close()
|
||||
|
||||
usernsInoFromNetnsFd := getInode(t, usernsFd)
|
||||
|
||||
t.Logf("Fetch netns namespace %s' user namespace owner %d", netnsPath, usernsInoFromNetnsFd)
|
||||
require.Equal(t, usernsIno, usernsInoFromNetnsFd)
|
||||
|
||||
parentUsernsFd, err := GetUsernsForNamespace(usernsFd.Fd())
|
||||
require.NoError(t, err)
|
||||
defer parentUsernsFd.Close()
|
||||
|
||||
parentUsernsIno := getInode(t, parentUsernsFd)
|
||||
t.Logf("User namespace %d's parent %d", usernsInoFromNetnsFd, parentUsernsIno)
|
||||
require.Equal(t, currentUsernsIno, parentUsernsIno)
|
||||
}
|
||||
|
||||
func getInode(t *testing.T, f *os.File) uint64 {
|
||||
info, err := f.Stat()
|
||||
require.NoError(t, err)
|
||||
return info.Sys().(*syscall.Stat_t).Ino
|
||||
}
|
||||
153
pkg/sys/unshare_linux.go
Normal file
153
pkg/sys/unshare_linux.go
Normal file
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// UnshareAfterEnterUserns allows to disassociate parts of its execution context
|
||||
// within a user namespace.
|
||||
func UnshareAfterEnterUserns(uidMap, gidMap string, unshareFlags uintptr, f func(pid int) error) (retErr error) {
|
||||
if unshareFlags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER {
|
||||
return fmt.Errorf("unshare flags should not include user namespace")
|
||||
}
|
||||
|
||||
uidMaps, err := parseIDMapping(uidMap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
gidMaps, err := parseIDMapping(gidMap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pidfd int
|
||||
proc, err := os.StartProcess("/proc/self/exe", []string{"UnshareAfterEnterUserns"}, &os.ProcAttr{
|
||||
Sys: &syscall.SysProcAttr{
|
||||
// clone new user namespace first and then unshare
|
||||
Cloneflags: unix.CLONE_NEWUSER,
|
||||
Unshareflags: unshareFlags,
|
||||
UidMappings: uidMaps,
|
||||
GidMappings: gidMaps,
|
||||
// NOTE: It's reexec but it's not heavy because subprocess
|
||||
// be in PTRACE_TRACEME mode before performing execve.
|
||||
Ptrace: true,
|
||||
Pdeathsig: syscall.SIGKILL,
|
||||
PidFD: &pidfd,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start noop process for unshare: %w", err)
|
||||
}
|
||||
|
||||
if pidfd == -1 || !SupportsPidFD() {
|
||||
proc.Kill()
|
||||
proc.Wait()
|
||||
return fmt.Errorf("kernel doesn't support CLONE_PIDFD")
|
||||
}
|
||||
|
||||
// Since go1.23.{0,1} has double close issue, we should dup it before using it.
|
||||
//
|
||||
// References:
|
||||
// - https://github.com/golang/go/issues/68984
|
||||
// - https://github.com/golang/go/milestone/371
|
||||
if goVer := runtime.Version(); goVer == "go1.23.0" || goVer == "go1.23.1" {
|
||||
dupPidfd, dupErr := unix.FcntlInt(uintptr(pidfd), syscall.F_DUPFD_CLOEXEC, 0)
|
||||
if dupErr != nil {
|
||||
proc.Kill()
|
||||
proc.Wait()
|
||||
return fmt.Errorf("failed to dupfd: %w", err)
|
||||
}
|
||||
pidfd = dupPidfd
|
||||
}
|
||||
|
||||
defer func() {
|
||||
derr := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0)
|
||||
if derr != nil {
|
||||
if !errors.Is(derr, unix.ESRCH) {
|
||||
retErr = derr
|
||||
}
|
||||
return
|
||||
}
|
||||
pidfdWaitid(pidfd)
|
||||
}()
|
||||
|
||||
if f != nil {
|
||||
if err := f(proc.Pid); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||
// should return error because the pid could be reused. It's safe to
|
||||
// return error and retry.
|
||||
if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil {
|
||||
return fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseIDMapping converts a single mapping written as
// `container-id:host-id:size` into a one-element slice of
// syscall.SysProcIDMap.
//
// An error is returned when the string does not have exactly three
// colon-separated fields, when a field is not a decimal integer, or when any
// of the three values is negative.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	fields := strings.Split(mapping, ":")
	if len(fields) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}

	// Field descriptions, used only to build the per-field error message.
	labels := [3]string{"container id", "host id", "size"}

	var values [3]int
	for i, field := range fields {
		v, err := strconv.Atoi(field)
		if err != nil {
			return nil, fmt.Errorf("invalid %s for user namespace remapping, %w", labels[i], err)
		}
		values[i] = v
	}

	// Negative values are rejected only after every field has parsed.
	for _, v := range values {
		if v < 0 {
			return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
		}
	}

	idMap := syscall.SysProcIDMap{
		ContainerID: values[0],
		HostID:      values[1],
		Size:        values[2],
	}
	return []syscall.SysProcIDMap{idMap}, nil
}
|
||||
|
||||
func pidfdWaitid(pidfd int) error {
|
||||
return IgnoringEINTR(func() error {
|
||||
return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
|
||||
})
|
||||
}
|
||||
149
pkg/sys/unshare_linux_test.go
Normal file
149
pkg/sys/unshare_linux_test.go
Normal file
@@ -0,0 +1,149 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
|
||||
"github.com/containerd/continuity/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestUnshareAfterEnterUserns(t *testing.T) {
|
||||
testutil.RequiresRoot(t)
|
||||
|
||||
k510 := kernel.KernelVersion{Kernel: 5, Major: 10}
|
||||
ok, err := kernel.GreaterEqualThan(k510)
|
||||
require.NoError(t, err)
|
||||
if !ok {
|
||||
t.Skip("Requires kernel >= 5.10")
|
||||
}
|
||||
|
||||
err = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWUSER|syscall.CLONE_NEWIPC, nil)
|
||||
require.Error(t, err)
|
||||
require.ErrorContains(t, err, "unshare flags should not include user namespace")
|
||||
|
||||
t.Run("should work", testUnshareAfterEnterUsernsShouldWork)
|
||||
t.Run("killpid", testUnshareAfterEnterUsernsKillPid)
|
||||
t.Run("invalid unshare flags", testUnshareAfterEnterUsernsInvalidFlags)
|
||||
}
|
||||
|
||||
func testUnshareAfterEnterUsernsShouldWork(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
currentNetNs, err := getNamespaceInode(os.Getpid(), "net")
|
||||
require.NoError(t, err)
|
||||
|
||||
currentUserNs, err := getNamespaceInode(os.Getpid(), "user")
|
||||
require.NoError(t, err)
|
||||
|
||||
currentIpcNs, err := getNamespaceInode(os.Getpid(), "ipc")
|
||||
require.NoError(t, err)
|
||||
|
||||
currentPidNs, err := getNamespaceInode(os.Getpid(), "pid")
|
||||
require.NoError(t, err)
|
||||
|
||||
uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||
netNs, err := getNamespaceInode(pid, "net")
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, currentNetNs, netNs)
|
||||
|
||||
userNs, err := getNamespaceInode(pid, "user")
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, currentUserNs, userNs)
|
||||
|
||||
ipcNs, err := getNamespaceInode(pid, "ipc")
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, currentIpcNs, ipcNs)
|
||||
|
||||
pidNs, err := getNamespaceInode(pid, "pid")
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, currentPidNs, pidNs)
|
||||
|
||||
data, err := os.ReadFile(fmt.Sprintf("/proc/%d/uid_map", pid))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, " 0 1000 10\n", string(data))
|
||||
|
||||
data, err = os.ReadFile(fmt.Sprintf("/proc/%d/gid_map", pid))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, " 0 1000 10\n", string(data))
|
||||
return nil
|
||||
})
|
||||
require.NoError(t, uerr)
|
||||
}
|
||||
|
||||
func testUnshareAfterEnterUsernsKillPid(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||
proc, err := os.FindProcess(pid)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to find process: %w", err)
|
||||
}
|
||||
|
||||
if err := proc.Kill(); err != nil {
|
||||
return fmt.Errorf("failed to kill process: %w", err)
|
||||
}
|
||||
|
||||
proc.Wait()
|
||||
|
||||
_, err = os.OpenFile(fmt.Sprintf("/proc/%d/ns/net", pid), os.O_RDONLY, 0600)
|
||||
require.Error(t, err)
|
||||
require.ErrorIs(t, err, os.ErrNotExist)
|
||||
return err
|
||||
})
|
||||
require.Error(t, uerr)
|
||||
require.ErrorIs(t, uerr, os.ErrNotExist)
|
||||
|
||||
uerr = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||
proc, err := os.FindProcess(pid)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to find process: %w", err)
|
||||
}
|
||||
|
||||
if err := proc.Kill(); err != nil {
|
||||
return fmt.Errorf("failed to kill process: %w", err)
|
||||
}
|
||||
|
||||
proc.Wait()
|
||||
|
||||
return nil
|
||||
})
|
||||
require.Error(t, uerr)
|
||||
require.ErrorContains(t, uerr, "failed to ensure child process is alive: no such process")
|
||||
}
|
||||
|
||||
func testUnshareAfterEnterUsernsInvalidFlags(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_IO, nil)
|
||||
require.Error(t, uerr)
|
||||
require.ErrorContains(t, uerr, "fork/exec /proc/self/exe: invalid argument")
|
||||
}
|
||||
|
||||
// getNamespaceInode stats /proc/<pid>/ns/<typ> and returns the inode number
// from the result. If the stat call fails, it returns 0 and that error.
func getNamespaceInode(pid int, typ string) (uint64, error) {
	nsPath := fmt.Sprintf("/proc/%d/ns/%s", pid, typ)

	fi, err := os.Stat(nsPath)
	if err != nil {
		return 0, err
	}

	st := fi.Sys().(*syscall.Stat_t)
	return st.Ino, nil
}
|
||||
Reference in New Issue
Block a user