155 lines
4.1 KiB
Go
155 lines
4.1 KiB
Go
/*
|
|
Copyright The containerd Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package sys
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// UnshareAfterEnterUserns allows to disassociate parts of its execution context
|
|
// within a user namespace.
|
|
func UnshareAfterEnterUserns(uidMap, gidMap string, unshareFlags uintptr, f func(pid int) error) (retErr error) {
|
|
if unshareFlags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER {
|
|
return fmt.Errorf("unshare flags should not include user namespace")
|
|
}
|
|
|
|
uidMaps, err := parseIDMapping(uidMap)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
gidMaps, err := parseIDMapping(gidMap)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var pidfd int
|
|
proc, err := os.StartProcess("/proc/self/exe", []string{"UnshareAfterEnterUserns"}, &os.ProcAttr{
|
|
Sys: &syscall.SysProcAttr{
|
|
// clone new user namespace first and then unshare
|
|
Cloneflags: unix.CLONE_NEWUSER,
|
|
Unshareflags: unshareFlags,
|
|
UidMappings: uidMaps,
|
|
GidMappings: gidMaps,
|
|
GidMappingsEnableSetgroups: true,
|
|
// NOTE: It's reexec but it's not heavy because subprocess
|
|
// be in PTRACE_TRACEME mode before performing execve.
|
|
Ptrace: true,
|
|
Pdeathsig: syscall.SIGKILL,
|
|
PidFD: &pidfd,
|
|
},
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("failed to start noop process for unshare: %w", err)
|
|
}
|
|
|
|
if pidfd == -1 || !SupportsPidFD() {
|
|
proc.Kill()
|
|
proc.Wait()
|
|
return fmt.Errorf("kernel doesn't support CLONE_PIDFD")
|
|
}
|
|
|
|
// Since go1.23.{0,1} has double close issue, we should dup it before using it.
|
|
//
|
|
// References:
|
|
// - https://github.com/golang/go/issues/68984
|
|
// - https://github.com/golang/go/milestone/371
|
|
if goVer := runtime.Version(); goVer == "go1.23.0" || goVer == "go1.23.1" {
|
|
dupPidfd, dupErr := unix.FcntlInt(uintptr(pidfd), syscall.F_DUPFD_CLOEXEC, 0)
|
|
if dupErr != nil {
|
|
proc.Kill()
|
|
proc.Wait()
|
|
return fmt.Errorf("failed to dupfd: %w", err)
|
|
}
|
|
pidfd = dupPidfd
|
|
}
|
|
|
|
defer func() {
|
|
derr := unix.PidfdSendSignal(pidfd, unix.SIGKILL, nil, 0)
|
|
if derr != nil {
|
|
if !errors.Is(derr, unix.ESRCH) {
|
|
retErr = derr
|
|
}
|
|
return
|
|
}
|
|
pidfdWaitid(pidfd)
|
|
}()
|
|
|
|
if f != nil {
|
|
if err := f(proc.Pid); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Ensure the child process is still alive. If the err is ESRCH, we
|
|
// should return error because the pid could be reused. It's safe to
|
|
// return error and retry.
|
|
if err := unix.PidfdSendSignal(pidfd, 0, nil, 0); err != nil {
|
|
return fmt.Errorf("failed to ensure child process is alive: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseIDMapping converts a single `container-id:host-id:size` string into a
// one-element slice of syscall.SysProcIDMap. It returns an error when the
// string does not have exactly three colon-separated fields, when a field is
// not an integer, or when any of the three values is negative.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	fields := strings.Split(mapping, ":")
	if len(fields) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}

	var (
		idMap syscall.SysProcIDMap
		err   error
	)

	// Fields are parsed in order so the first malformed one is reported.
	if idMap.ContainerID, err = strconv.Atoi(fields[0]); err != nil {
		return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
	}
	if idMap.HostID, err = strconv.Atoi(fields[1]); err != nil {
		return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
	}
	if idMap.Size, err = strconv.Atoi(fields[2]); err != nil {
		return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
	}

	switch {
	case idMap.ContainerID < 0, idMap.HostID < 0, idMap.Size < 0:
		return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
	}

	return []syscall.SysProcIDMap{idMap}, nil
}
|
|
|
|
func pidfdWaitid(pidfd int) error {
|
|
return IgnoringEINTR(func() error {
|
|
return unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil)
|
|
})
|
|
}
|