95 lines
2.5 KiB
Go
95 lines
2.5 KiB
Go
/*
|
|
Copyright The containerd Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package sys
|
|
|
|
import (
|
|
"runtime"
|
|
"syscall"
|
|
"unsafe"
|
|
)
|
|
|
|
// ForkUserns is to fork child process with user namespace. It returns child
|
|
// process's pid and pidfd reference to the child process.
|
|
//
|
|
// Precondition: The runtime OS thread must be locked, which is GO runtime
|
|
// requirement.
|
|
//
|
|
// Beside this, the child process sets PR_SET_PDEATHSIG with SIGKILL so that
|
|
// the parent process's OS thread must be locked. Otherwise, the exit event of
|
|
// parent process's OS thread will send kill signal to the child process,
|
|
// even if parent process is still running.
|
|
//
|
|
//go:norace
|
|
//go:noinline
|
|
func ForkUserns() (_pid uintptr, _pidfd uintptr, _ syscall.Errno) {
|
|
var (
|
|
pidfd uintptr
|
|
pid, ppid uintptr
|
|
err syscall.Errno
|
|
)
|
|
|
|
ppid, _, err = syscall.RawSyscall(uintptr(syscall.SYS_GETPID), 0, 0, 0)
|
|
if err != 0 {
|
|
return 0, 0, err
|
|
}
|
|
|
|
beforeFork()
|
|
if runtime.GOARCH == "s390x" {
|
|
// NOTE:
|
|
//
|
|
// On the s390 architectures, the order of the first two
|
|
// arguments is reversed.
|
|
//
|
|
// REF: https://man7.org/linux/man-pages/man2/clone.2.html
|
|
pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
|
|
0,
|
|
uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
|
|
uintptr(unsafe.Pointer(&pidfd)),
|
|
)
|
|
} else {
|
|
pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
|
|
uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
|
|
0,
|
|
uintptr(unsafe.Pointer(&pidfd)),
|
|
)
|
|
}
|
|
if err != 0 || pid != 0 {
|
|
afterFork()
|
|
return pid, pidfd, err
|
|
}
|
|
afterForkInChild()
|
|
|
|
if _, _, err = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 {
|
|
goto err
|
|
}
|
|
|
|
pid, _, err = syscall.RawSyscall(syscall.SYS_GETPPID, 0, 0, 0)
|
|
if err != 0 {
|
|
goto err
|
|
}
|
|
|
|
// exit if re-parent
|
|
if pid != ppid {
|
|
goto err
|
|
}
|
|
|
|
_, _, err = syscall.RawSyscall(syscall.SYS_PPOLL, 0, 0, 0)
|
|
err:
|
|
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0)
|
|
panic("unreachable")
|
|
}
|