mount: support idmapped mount points
This patch introduces idmapped mounts support for container rootfs. The idmapped mounts support was merged in Linux kernel 5.12 torvalds/linux@7d6beb7. This functionality allows to address chown overhead for containers that use user namespace. The changes are based on experimental patchset published by Mauricio Vásquez #4734. Current version reiplements support of idmapped mounts using Golang. Performance measurement results: Image idmapped mount recursive chown BusyBox 00.135 04.964 Ubuntu 00.171 15.713 Fedora 00.143 38.799 Signed-off-by: Mauricio Vásquez <mauricio@kinvolk.io> Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com> Signed-off-by: Alexey Perevalov <alexey.perevalov@huawei.com> Signed-off-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
This commit is contained in:
		
							
								
								
									
										65
									
								
								sys/userns_unsafe_linux.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								sys/userns_unsafe_linux.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,65 @@ | ||||
| /* | ||||
|    Copyright The containerd Authors. | ||||
|  | ||||
|    Licensed under the Apache License, Version 2.0 (the "License"); | ||||
|    you may not use this file except in compliance with the License. | ||||
|    You may obtain a copy of the License at | ||||
|  | ||||
|        http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
|    Unless required by applicable law or agreed to in writing, software | ||||
|    distributed under the License is distributed on an "AS IS" BASIS, | ||||
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|    See the License for the specific language governing permissions and | ||||
|    limitations under the License. | ||||
| */ | ||||
|  | ||||
| package sys | ||||
|  | ||||
| import ( | ||||
| 	"runtime" | ||||
| 	"syscall" | ||||
| 	"unsafe" | ||||
| ) | ||||
|  | ||||
| // ProcSyncType is used for synchronization | ||||
| // between parent and child processes. | ||||
| type ProcSyncType uint8 | ||||
|  | ||||
| const ( | ||||
| 	// ProcSyncExit tells child "it's time to exit". | ||||
| 	ProcSyncExit ProcSyncType = 0x1 | ||||
| ) | ||||
|  | ||||
| //go:norace | ||||
| //go:noinline | ||||
| func ForkUserns(pipeMap [2]int) (pid uintptr, errno syscall.Errno) { | ||||
| 	var sync ProcSyncType | ||||
|  | ||||
| 	beforeFork() | ||||
| 	if runtime.GOARCH == "s390x" { | ||||
| 		pid, _, errno = syscall.RawSyscall6(uintptr(syscall.SYS_CLONE), 0, syscall.CLONE_NEWUSER|uintptr(syscall.SIGCHLD), 0, 0, 0, 0) | ||||
| 	} else { | ||||
| 		pid, _, errno = syscall.RawSyscall6(uintptr(syscall.SYS_CLONE), syscall.CLONE_NEWUSER|uintptr(syscall.SIGCHLD), 0, 0, 0, 0, 0) | ||||
| 	} | ||||
| 	if errno != 0 || pid != 0 { | ||||
| 		afterFork() | ||||
| 		return pid, errno | ||||
| 	} | ||||
|  | ||||
| 	afterForkInChild() | ||||
| 	if _, _, errno = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(pipeMap[1]), 0, 0); errno != 0 { | ||||
| 		goto err | ||||
| 	} | ||||
| 	if _, _, errno = syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); errno != 0 { | ||||
| 		goto err | ||||
| 	} | ||||
| 	// wait for parent's signal | ||||
| 	if _, _, errno = syscall.RawSyscall6(syscall.SYS_READ, uintptr(pipeMap[0]), uintptr(unsafe.Pointer(&sync)), unsafe.Sizeof(sync), 0, 0, 0); errno != 0 || sync != ProcSyncExit { | ||||
| 		goto err | ||||
| 	} | ||||
|  | ||||
| err: | ||||
| 	syscall.RawSyscall6(syscall.SYS_EXIT, uintptr(errno), 0, 0, 0, 0, 0) | ||||
| 	panic("unreachable") | ||||
| } | ||||
		Reference in New Issue
	
	Block a user
	 Ilya Hanov
					Ilya Hanov