95 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			95 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
/*
 | 
						|
   Copyright The containerd Authors.
 | 
						|
 | 
						|
   Licensed under the Apache License, Version 2.0 (the "License");
 | 
						|
   you may not use this file except in compliance with the License.
 | 
						|
   You may obtain a copy of the License at
 | 
						|
 | 
						|
       http://www.apache.org/licenses/LICENSE-2.0
 | 
						|
 | 
						|
   Unless required by applicable law or agreed to in writing, software
 | 
						|
   distributed under the License is distributed on an "AS IS" BASIS,
 | 
						|
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
						|
   See the License for the specific language governing permissions and
 | 
						|
   limitations under the License.
 | 
						|
*/
 | 
						|
 | 
						|
package sys
 | 
						|
 | 
						|
import (
 | 
						|
	"runtime"
 | 
						|
	"syscall"
 | 
						|
	"unsafe"
 | 
						|
)
 | 
						|
 | 
						|
// ForkUserns is to fork child process with user namespace. It returns child
 | 
						|
// process's pid and pidfd reference to the child process.
 | 
						|
//
 | 
						|
// Precondition: The runtime OS thread must be locked, which is GO runtime
 | 
						|
// requirement.
 | 
						|
//
 | 
						|
// Beside this, the child process sets PR_SET_PDEATHSIG with SIGKILL so that
 | 
						|
// the parent process's OS thread must be locked. Otherwise, the exit event of
 | 
						|
// parent process's OS thread will send kill signal to the child process,
 | 
						|
// even if parent process is still running.
 | 
						|
//
 | 
						|
//go:norace
 | 
						|
//go:noinline
 | 
						|
func ForkUserns() (_pid uintptr, _pidfd uintptr, _ syscall.Errno) {
 | 
						|
	var (
 | 
						|
		pidfd     uintptr
 | 
						|
		pid, ppid uintptr
 | 
						|
		err       syscall.Errno
 | 
						|
	)
 | 
						|
 | 
						|
	ppid, _, err = syscall.RawSyscall(uintptr(syscall.SYS_GETPID), 0, 0, 0)
 | 
						|
	if err != 0 {
 | 
						|
		return 0, 0, err
 | 
						|
	}
 | 
						|
 | 
						|
	beforeFork()
 | 
						|
	if runtime.GOARCH == "s390x" {
 | 
						|
		// NOTE:
 | 
						|
		//
 | 
						|
		// On the s390 architectures, the order of the first two
 | 
						|
		// arguments is reversed.
 | 
						|
		//
 | 
						|
		// REF: https://man7.org/linux/man-pages/man2/clone.2.html
 | 
						|
		pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
 | 
						|
			0,
 | 
						|
			uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
 | 
						|
			uintptr(unsafe.Pointer(&pidfd)),
 | 
						|
		)
 | 
						|
	} else {
 | 
						|
		pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
 | 
						|
			uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
 | 
						|
			0,
 | 
						|
			uintptr(unsafe.Pointer(&pidfd)),
 | 
						|
		)
 | 
						|
	}
 | 
						|
	if err != 0 || pid != 0 {
 | 
						|
		afterFork()
 | 
						|
		return pid, pidfd, err
 | 
						|
	}
 | 
						|
	afterForkInChild()
 | 
						|
 | 
						|
	if _, _, err = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 {
 | 
						|
		goto err
 | 
						|
	}
 | 
						|
 | 
						|
	pid, _, err = syscall.RawSyscall(syscall.SYS_GETPPID, 0, 0, 0)
 | 
						|
	if err != 0 {
 | 
						|
		goto err
 | 
						|
	}
 | 
						|
 | 
						|
	// exit if re-parent
 | 
						|
	if pid != ppid {
 | 
						|
		goto err
 | 
						|
	}
 | 
						|
 | 
						|
	_, _, err = syscall.RawSyscall(syscall.SYS_PPOLL, 0, 0, 0)
 | 
						|
err:
 | 
						|
	syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0)
 | 
						|
	panic("unreachable")
 | 
						|
}
 |