mount: support idmapped mount points
This patch introduces idmapped mounts support for container rootfs. The idmapped mounts support was merged in Linux kernel 5.12 torvalds/linux@7d6beb7. This functionality allows to address chown overhead for containers that use user namespace. The changes are based on experimental patchset published by Mauricio Vásquez #4734. Current version reiplements support of idmapped mounts using Golang. Performance measurement results: Image idmapped mount recursive chown BusyBox 00.135 04.964 Ubuntu 00.171 15.713 Fedora 00.143 38.799 Signed-off-by: Mauricio Vásquez <mauricio@kinvolk.io> Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com> Signed-off-by: Alexey Perevalov <alexey.perevalov@huawei.com> Signed-off-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
This commit is contained in:
166
mount/mount_idmapped_linux.go
Normal file
166
mount/mount_idmapped_linux.go
Normal file
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package mount
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"github.com/containerd/containerd/sys"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// TODO: Support multiple mappings in future
|
||||
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
|
||||
parts := strings.Split(mapping, ":")
|
||||
if len(parts) != 3 {
|
||||
return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
|
||||
}
|
||||
cID, err := strconv.Atoi(parts[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
|
||||
}
|
||||
hID, err := strconv.Atoi(parts[1])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
|
||||
}
|
||||
size, err := strconv.Atoi(parts[2])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
|
||||
}
|
||||
if cID != 0 || hID < 0 || size < 0 {
|
||||
return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers (container ID of 0 is only supported)", mapping)
|
||||
}
|
||||
return []syscall.SysProcIDMap{
|
||||
{
|
||||
ContainerID: cID,
|
||||
HostID: hID,
|
||||
Size: size,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// IDMapMount applies GID/UID shift according to gidmap/uidmap for target path
|
||||
func IDMapMount(source, target string, usernsFd int) (err error) {
|
||||
var (
|
||||
attr unix.MountAttr
|
||||
)
|
||||
|
||||
attr.Attr_set = unix.MOUNT_ATTR_IDMAP
|
||||
attr.Attr_clr = 0
|
||||
attr.Propagation = 0
|
||||
attr.Userns_fd = uint64(usernsFd)
|
||||
|
||||
dFd, err := unix.OpenTree(-int(unix.EBADF), source, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC|unix.AT_EMPTY_PATH))
|
||||
if err != nil {
|
||||
return fmt.Errorf("Unable to open tree for %s: %w", target, err)
|
||||
}
|
||||
|
||||
defer unix.Close(dFd)
|
||||
if err = unix.MountSetattr(dFd, "", unix.AT_EMPTY_PATH, &attr); err != nil {
|
||||
return fmt.Errorf("Unable to shift GID/UID for %s: %w", target, err)
|
||||
}
|
||||
|
||||
if err = unix.MoveMount(dFd, "", -int(unix.EBADF), target, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
|
||||
return fmt.Errorf("Unable to attach mount tree to %s: %w", target, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetUsernsFD forks the current process and creates a user namespace using the specified
|
||||
// mappings.
|
||||
//
|
||||
// It returns:
|
||||
// 1. The file descriptor of the /proc/[pid]/ns/user of the newly
|
||||
// created mapping.
|
||||
// 2. "Clean up" function that should be called once user namespace
|
||||
// file descriptor is no longer needed.
|
||||
// 3. Usual error.
|
||||
func GetUsernsFD(uidmap, gidmap string) (_ int, _ func(), err error) {
|
||||
var (
|
||||
usernsFile *os.File
|
||||
pipeMap [2]int
|
||||
pid uintptr
|
||||
errno syscall.Errno
|
||||
uidMaps, gidMaps []syscall.SysProcIDMap
|
||||
)
|
||||
|
||||
if uidMaps, err = parseIDMapping(uidmap); err != nil {
|
||||
return -1, nil, err
|
||||
}
|
||||
if gidMaps, err = parseIDMapping(gidmap); err != nil {
|
||||
return -1, nil, err
|
||||
}
|
||||
|
||||
syscall.ForkLock.Lock()
|
||||
if err = syscall.Pipe2(pipeMap[:], syscall.O_CLOEXEC); err != nil {
|
||||
syscall.ForkLock.Unlock()
|
||||
return -1, nil, err
|
||||
}
|
||||
|
||||
pid, errno = sys.ForkUserns(pipeMap)
|
||||
syscall.ForkLock.Unlock()
|
||||
if errno != 0 {
|
||||
syscall.Close(pipeMap[0])
|
||||
syscall.Close(pipeMap[1])
|
||||
return -1, nil, errno
|
||||
}
|
||||
|
||||
syscall.Close(pipeMap[0])
|
||||
|
||||
writeMappings := func(fname string, idmap []syscall.SysProcIDMap) error {
|
||||
mappings := ""
|
||||
for _, m := range idmap {
|
||||
mappings = fmt.Sprintf("%d %d %d\n", m.ContainerID, m.HostID, m.Size)
|
||||
}
|
||||
return os.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings), 0600)
|
||||
}
|
||||
|
||||
cleanUpChild := func() {
|
||||
sync := sys.ProcSyncExit
|
||||
if _, _, errno := syscall.Syscall6(syscall.SYS_WRITE, uintptr(pipeMap[1]), uintptr(unsafe.Pointer(&sync)), unsafe.Sizeof(sync), 0, 0, 0); errno != 0 {
|
||||
logrus.WithError(errno).Warnf("failed to sync with child (ProcSyncExit)")
|
||||
}
|
||||
syscall.Close(pipeMap[1])
|
||||
|
||||
if _, err := unix.Wait4(int(pid), nil, 0, nil); err != nil {
|
||||
logrus.WithError(err).Warnf("failed to wait for child process; the SIGHLD might be received by shim reaper")
|
||||
}
|
||||
}
|
||||
defer cleanUpChild()
|
||||
|
||||
if err := writeMappings("uid_map", uidMaps); err != nil {
|
||||
return -1, nil, err
|
||||
}
|
||||
if err := writeMappings("gid_map", gidMaps); err != nil {
|
||||
return -1, nil, err
|
||||
}
|
||||
|
||||
if usernsFile, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", pid)); err != nil {
|
||||
return -1, nil, fmt.Errorf("failed to get user ns file descriptor for - /proc/%d/user/ns: %w", pid, err)
|
||||
}
|
||||
|
||||
return int(usernsFile.Fd()), func() {
|
||||
usernsFile.Close()
|
||||
}, nil
|
||||
}
|
||||
Reference in New Issue
Block a user