pkg/sys: Add UnshareAfterEnterUserns function
It allows the caller to disassociate parts of its execution context within a user namespace. Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
parent
c8b095f3c2
commit
490e45a08a
153
pkg/sys/unshare_linux.go
Normal file
153
pkg/sys/unshare_linux.go
Normal file
@ -0,0 +1,153 @@
|
|||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package sys
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
)
|
||||||
|
|
||||||
|
// UnshareAfterEnterUserns allows to disassociate parts of its execution context
|
||||||
|
// within a user namespace.
|
||||||
|
func UnshareAfterEnterUserns(uidMap, gidMap string, unshareFlags uintptr, f func(pid int) error) (retErr error) {
|
||||||
|
if unshareFlags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER {
|
||||||
|
return fmt.Errorf("unshare flags should not include user namespace")
|
||||||
|
}
|
||||||
|
|
||||||
|
uidMaps, err := parseIDMapping(uidMap)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
gidMaps, err := parseIDMapping(gidMap)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
var pidfd int
|
||||||
|
proc, err := os.StartProcess("/proc/self/exe", []string{"UnshareAfterEnterUserns"}, &os.ProcAttr{
|
||||||
|
Sys: &syscall.SysProcAttr{
|
||||||
|
// clone new user namespace first and then unshare
|
||||||
|
Cloneflags: unix.CLONE_NEWUSER,
|
||||||
|
Unshareflags: unshareFlags,
|
||||||
|
UidMappings: uidMaps,
|
||||||
|
GidMappings: gidMaps,
|
||||||
|
// NOTE: It's reexec but it's not heavy because subprocess
|
||||||
|
// be in PTRACE_TRACEME mode before performing execve.
|
||||||
|
Ptrace: true,
|
||||||
|
Pdeathsig: syscall.SIGKILL,
|
||||||
|
PidFD: &pidfd,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to start noop process for unshare: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if pidfd == -1 || !SupportsPidFD() {
|
||||||
|
proc.Kill()
|
||||||
|
proc.Wait()
|
||||||
|
return fmt.Errorf("kernel doesn't support CLONE_PIDFD")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Since go1.23.{0,1} has double close issue, we should dup it before using it.
|
||||||
|
//
|
||||||
|
// References:
|
||||||
|
// - https://github.com/golang/go/issues/68984
|
||||||
|
// - https://github.com/golang/go/milestone/371
|
||||||
|
if goVer := runtime.Version(); goVer == "go1.23.0" || goVer == "go1.23.1" {
|
||||||
|
dupPidfd, dupErr := unix.FcntlInt(uintptr(pidfd), syscall.F_DUPFD_CLOEXEC, 0)
|
||||||
|
if dupErr != nil {
|
||||||
|
proc.Kill()
|
||||||
|
proc.Wait()
|
||||||
|
return fmt.Errorf("failed to dupfd: %w", err)
|
||||||
|
}
|
||||||
|
pidfd = dupPidfd
|
||||||
|
}
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
derr := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0)
|
||||||
|
if derr != nil {
|
||||||
|
if !errors.Is(derr, unix.ESRCH) {
|
||||||
|
retErr = derr
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
pidfdWaitid(pidfd)
|
||||||
|
}()
|
||||||
|
|
||||||
|
if f != nil {
|
||||||
|
if err := f(proc.Pid); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||||
|
// should return error because the pid could be reused. It's safe to
|
||||||
|
// return error and retry.
|
||||||
|
if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil {
|
||||||
|
return fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseIDMapping converts a single "container-id:host-id:size" string into the
// one-element slice used for the UidMappings/GidMappings fields of
// syscall.SysProcAttr.
//
// It returns an error when the string does not have exactly three
// colon-separated fields, when a field is not an integer, when an ID is
// negative, or when size is not greater than zero (the kernel rejects
// zero-length extents, so such a mapping could never be applied).
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	parts := strings.Split(mapping, ":")
	if len(parts) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}

	cID, err := strconv.Atoi(parts[0])
	if err != nil {
		return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
	}

	hID, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
	}

	size, err := strconv.Atoi(parts[2])
	if err != nil {
		return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
	}

	// ID 0 is valid (it is root inside the namespace); an empty range is not.
	if cID < 0 || hID < 0 || size <= 0 {
		return nil, fmt.Errorf("invalid mapping %s, IDs must be non-negative integers and size must be a positive integer", mapping)
	}

	return []syscall.SysProcIDMap{
		{
			ContainerID: cID,
			HostID:      hID,
			Size:        size,
		},
	}, nil
}
|
||||||
|
|
||||||
|
func pidfdWaitid(pidfd int) error {
|
||||||
|
return IgnoringEINTR(func() error {
|
||||||
|
return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
|
||||||
|
})
|
||||||
|
}
|
149
pkg/sys/unshare_linux_test.go
Normal file
149
pkg/sys/unshare_linux_test.go
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package sys
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"syscall"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
kernel "github.com/containerd/containerd/v2/pkg/kernelversion"
|
||||||
|
"github.com/containerd/continuity/testutil"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestUnshareAfterEnterUserns(t *testing.T) {
|
||||||
|
testutil.RequiresRoot(t)
|
||||||
|
|
||||||
|
k510 := kernel.KernelVersion{Kernel: 5, Major: 10}
|
||||||
|
ok, err := kernel.GreaterEqualThan(k510)
|
||||||
|
require.NoError(t, err)
|
||||||
|
if !ok {
|
||||||
|
t.Skip("Requires kernel >= 5.10")
|
||||||
|
}
|
||||||
|
|
||||||
|
err = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWUSER|syscall.CLONE_NEWIPC, nil)
|
||||||
|
require.Error(t, err)
|
||||||
|
require.ErrorContains(t, err, "unshare flags should not include user namespace")
|
||||||
|
|
||||||
|
t.Run("should work", testUnshareAfterEnterUsernsShouldWork)
|
||||||
|
t.Run("killpid", testUnshareAfterEnterUsernsKillPid)
|
||||||
|
t.Run("invalid unshare flags", testUnshareAfterEnterUsernsInvalidFlags)
|
||||||
|
}
|
||||||
|
|
||||||
|
func testUnshareAfterEnterUsernsShouldWork(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
currentNetNs, err := getNamespaceInode(os.Getpid(), "net")
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
currentUserNs, err := getNamespaceInode(os.Getpid(), "user")
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
currentIpcNs, err := getNamespaceInode(os.Getpid(), "ipc")
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
currentPidNs, err := getNamespaceInode(os.Getpid(), "pid")
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
uerr := UnshareAfterEnterUserns("0:1000:10", "0:1000:10", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||||
|
netNs, err := getNamespaceInode(pid, "net")
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotEqual(t, currentNetNs, netNs)
|
||||||
|
|
||||||
|
userNs, err := getNamespaceInode(pid, "user")
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotEqual(t, currentUserNs, userNs)
|
||||||
|
|
||||||
|
ipcNs, err := getNamespaceInode(pid, "ipc")
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotEqual(t, currentIpcNs, ipcNs)
|
||||||
|
|
||||||
|
pidNs, err := getNamespaceInode(pid, "pid")
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, currentPidNs, pidNs)
|
||||||
|
|
||||||
|
data, err := os.ReadFile(fmt.Sprintf("/proc/%d/uid_map", pid))
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, " 0 1000 10\n", string(data))
|
||||||
|
|
||||||
|
data, err = os.ReadFile(fmt.Sprintf("/proc/%d/gid_map", pid))
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, " 0 1000 10\n", string(data))
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
require.NoError(t, uerr)
|
||||||
|
}
|
||||||
|
|
||||||
|
func testUnshareAfterEnterUsernsKillPid(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||||
|
proc, err := os.FindProcess(pid)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to find process: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := proc.Kill(); err != nil {
|
||||||
|
return fmt.Errorf("failed to kill process: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
proc.Wait()
|
||||||
|
|
||||||
|
_, err = os.OpenFile(fmt.Sprintf("/proc/%d/ns/net", pid), os.O_RDONLY, 0600)
|
||||||
|
require.Error(t, err)
|
||||||
|
require.ErrorIs(t, err, os.ErrNotExist)
|
||||||
|
return err
|
||||||
|
})
|
||||||
|
require.Error(t, uerr)
|
||||||
|
require.ErrorIs(t, uerr, os.ErrNotExist)
|
||||||
|
|
||||||
|
uerr = UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_NEWIPC|syscall.CLONE_NEWNET, func(pid int) error {
|
||||||
|
proc, err := os.FindProcess(pid)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to find process: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := proc.Kill(); err != nil {
|
||||||
|
return fmt.Errorf("failed to kill process: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
proc.Wait()
|
||||||
|
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
require.Error(t, uerr)
|
||||||
|
require.ErrorContains(t, uerr, "failed to ensure child process is alive: no such process")
|
||||||
|
}
|
||||||
|
|
||||||
|
func testUnshareAfterEnterUsernsInvalidFlags(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
uerr := UnshareAfterEnterUserns("0:1000:1", "0:1000:1", syscall.CLONE_IO, nil)
|
||||||
|
require.Error(t, uerr)
|
||||||
|
require.ErrorContains(t, uerr, "fork/exec /proc/self/exe: invalid argument")
|
||||||
|
}
|
||||||
|
|
||||||
|
// getNamespaceInode returns the inode number reported by stat for
// /proc/<pid>/ns/<typ>. The tests compare these numbers to decide whether two
// processes share a namespace. Any error from os.Stat is returned unchanged.
func getNamespaceInode(pid int, typ string) (uint64, error) {
	nsPath := fmt.Sprintf("/proc/%d/ns/%s", pid, typ)

	fi, statErr := os.Stat(nsPath)
	if statErr != nil {
		return 0, statErr
	}

	st := fi.Sys().(*syscall.Stat_t)
	return st.Ino, nil
}
|
Loading…
Reference in New Issue
Block a user