pkg/sys: Add UnshareAfterEnterUserns function
It allows to disassociate parts of its execution context within a user namespace. Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
153
pkg/sys/unshare_linux.go
Normal file
153
pkg/sys/unshare_linux.go
Normal file
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package sys
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// UnshareAfterEnterUserns allows to disassociate parts of its execution context
|
||||
// within a user namespace.
|
||||
func UnshareAfterEnterUserns(uidMap, gidMap string, unshareFlags uintptr, f func(pid int) error) (retErr error) {
|
||||
if unshareFlags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER {
|
||||
return fmt.Errorf("unshare flags should not include user namespace")
|
||||
}
|
||||
|
||||
uidMaps, err := parseIDMapping(uidMap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
gidMaps, err := parseIDMapping(gidMap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pidfd int
|
||||
proc, err := os.StartProcess("/proc/self/exe", []string{"UnshareAfterEnterUserns"}, &os.ProcAttr{
|
||||
Sys: &syscall.SysProcAttr{
|
||||
// clone new user namespace first and then unshare
|
||||
Cloneflags: unix.CLONE_NEWUSER,
|
||||
Unshareflags: unshareFlags,
|
||||
UidMappings: uidMaps,
|
||||
GidMappings: gidMaps,
|
||||
// NOTE: It's reexec but it's not heavy because subprocess
|
||||
// be in PTRACE_TRACEME mode before performing execve.
|
||||
Ptrace: true,
|
||||
Pdeathsig: syscall.SIGKILL,
|
||||
PidFD: &pidfd,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start noop process for unshare: %w", err)
|
||||
}
|
||||
|
||||
if pidfd == -1 || !SupportsPidFD() {
|
||||
proc.Kill()
|
||||
proc.Wait()
|
||||
return fmt.Errorf("kernel doesn't support CLONE_PIDFD")
|
||||
}
|
||||
|
||||
// Since go1.23.{0,1} has double close issue, we should dup it before using it.
|
||||
//
|
||||
// References:
|
||||
// - https://github.com/golang/go/issues/68984
|
||||
// - https://github.com/golang/go/milestone/371
|
||||
if goVer := runtime.Version(); goVer == "go1.23.0" || goVer == "go1.23.1" {
|
||||
dupPidfd, dupErr := unix.FcntlInt(uintptr(pidfd), syscall.F_DUPFD_CLOEXEC, 0)
|
||||
if dupErr != nil {
|
||||
proc.Kill()
|
||||
proc.Wait()
|
||||
return fmt.Errorf("failed to dupfd: %w", err)
|
||||
}
|
||||
pidfd = dupPidfd
|
||||
}
|
||||
|
||||
defer func() {
|
||||
derr := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0)
|
||||
if derr != nil {
|
||||
if !errors.Is(derr, unix.ESRCH) {
|
||||
retErr = derr
|
||||
}
|
||||
return
|
||||
}
|
||||
pidfdWaitid(pidfd)
|
||||
}()
|
||||
|
||||
if f != nil {
|
||||
if err := f(proc.Pid); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure the child process is still alive. If the err is ESRCH, we
|
||||
// should return error because the pid could be reused. It's safe to
|
||||
// return error and retry.
|
||||
if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil {
|
||||
return fmt.Errorf("failed to ensure child process is alive: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: Support multiple mappings in future
|
||||
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
|
||||
parts := strings.Split(mapping, ":")
|
||||
if len(parts) != 3 {
|
||||
return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
|
||||
}
|
||||
|
||||
cID, err := strconv.Atoi(parts[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
|
||||
}
|
||||
|
||||
hID, err := strconv.Atoi(parts[1])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
|
||||
}
|
||||
|
||||
size, err := strconv.Atoi(parts[2])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
|
||||
}
|
||||
|
||||
if cID < 0 || hID < 0 || size < 0 {
|
||||
return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
|
||||
}
|
||||
|
||||
return []syscall.SysProcIDMap{
|
||||
{
|
||||
ContainerID: cID,
|
||||
HostID: hID,
|
||||
Size: size,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func pidfdWaitid(pidfd int) error {
|
||||
return IgnoringEINTR(func() error {
|
||||
return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user