
This patch introduces idmapped mounts support for container rootfs. The idmapped mounts support was merged in Linux kernel 5.12 torvalds/linux@7d6beb7. This functionality allows to address chown overhead for containers that use user namespace. The changes are based on experimental patchset published by Mauricio Vásquez #4734. Current version reiplements support of idmapped mounts using Golang. Performance measurement results: Image idmapped mount recursive chown BusyBox 00.135 04.964 Ubuntu 00.171 15.713 Fedora 00.143 38.799 Signed-off-by: Mauricio Vásquez <mauricio@kinvolk.io> Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com> Signed-off-by: Alexey Perevalov <alexey.perevalov@huawei.com> Signed-off-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
66 lines
1.9 KiB
Go
66 lines
1.9 KiB
Go
/*
|
|
Copyright The containerd Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package sys
|
|
|
|
import (
|
|
"runtime"
|
|
"syscall"
|
|
"unsafe"
|
|
)
|
|
|
|
// ProcSyncType is used for synchronization
|
|
// between parent and child processes.
|
|
type ProcSyncType uint8
|
|
|
|
const (
|
|
// ProcSyncExit tells child "it's time to exit".
|
|
ProcSyncExit ProcSyncType = 0x1
|
|
)
|
|
|
|
//go:norace
|
|
//go:noinline
|
|
func ForkUserns(pipeMap [2]int) (pid uintptr, errno syscall.Errno) {
|
|
var sync ProcSyncType
|
|
|
|
beforeFork()
|
|
if runtime.GOARCH == "s390x" {
|
|
pid, _, errno = syscall.RawSyscall6(uintptr(syscall.SYS_CLONE), 0, syscall.CLONE_NEWUSER|uintptr(syscall.SIGCHLD), 0, 0, 0, 0)
|
|
} else {
|
|
pid, _, errno = syscall.RawSyscall6(uintptr(syscall.SYS_CLONE), syscall.CLONE_NEWUSER|uintptr(syscall.SIGCHLD), 0, 0, 0, 0, 0)
|
|
}
|
|
if errno != 0 || pid != 0 {
|
|
afterFork()
|
|
return pid, errno
|
|
}
|
|
|
|
afterForkInChild()
|
|
if _, _, errno = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(pipeMap[1]), 0, 0); errno != 0 {
|
|
goto err
|
|
}
|
|
if _, _, errno = syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); errno != 0 {
|
|
goto err
|
|
}
|
|
// wait for parent's signal
|
|
if _, _, errno = syscall.RawSyscall6(syscall.SYS_READ, uintptr(pipeMap[0]), uintptr(unsafe.Pointer(&sync)), unsafe.Sizeof(sync), 0, 0, 0); errno != 0 || sync != ProcSyncExit {
|
|
goto err
|
|
}
|
|
|
|
err:
|
|
syscall.RawSyscall6(syscall.SYS_EXIT, uintptr(errno), 0, 0, 0, 0, 0)
|
|
panic("unreachable")
|
|
}
|