containerd/mount/mount_idmapped_linux.go
Ilya Hanov 1555a31bf6 mount: support idmapped mount points
This patch introduces idmapped mounts support for
container rootfs.

The idmapped mounts support was merged in Linux kernel 5.12
torvalds/linux@7d6beb7.
This functionality makes it possible to avoid the chown overhead for
containers that use a user namespace.

The changes are based on experimental patchset published by
Mauricio Vásquez #4734.
The current version reimplements support of idmapped mounts using Golang.

Performance measurement results:
Image           idmapped mount  recursive chown
BusyBox         00.135          04.964
Ubuntu          00.171          15.713
Fedora          00.143          38.799

Signed-off-by: Mauricio Vásquez <mauricio@kinvolk.io>
Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com>
Signed-off-by: Alexey Perevalov <alexey.perevalov@huawei.com>
Signed-off-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
2023-09-05 01:23:30 +03:00

167 lines
4.9 KiB
Go

/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mount
import (
"fmt"
"os"
"strconv"
"strings"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
"github.com/containerd/containerd/sys"
"github.com/sirupsen/logrus"
)
// parseIDMapping parses a single ID mapping given as
// `container-id:host-id:size` and returns it as a one-element slice.
//
// The mapping is accepted only when the container ID is 0, the host ID is
// non-negative and the size is strictly positive. A zero size is rejected
// here because a zero-length extent is refused by the kernel when the map is
// later written to /proc/[pid]/uid_map or gid_map; failing early gives the
// caller a descriptive error instead of a bare EINVAL.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	parts := strings.Split(mapping, ":")
	if len(parts) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}

	cID, err := strconv.Atoi(parts[0])
	if err != nil {
		return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
	}

	hID, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
	}

	size, err := strconv.Atoi(parts[2])
	if err != nil {
		return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
	}

	// size must be > 0: an empty range maps nothing and cannot be installed.
	if cID != 0 || hID < 0 || size <= 0 {
		return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers (container ID of 0 is only supported)", mapping)
	}

	return []syscall.SysProcIDMap{
		{
			ContainerID: cID,
			HostID:      hID,
			Size:        size,
		},
	}, nil
}
// IDMapMount applies GID/UID shift according to gidmap/uidmap for target path.
//
// It performs three steps:
//  1. opens the mount tree at source with unix.OpenTree (OPEN_TREE_CLONE),
//  2. sets MOUNT_ATTR_IDMAP on that tree with unix.MountSetattr, using
//     usernsFd as the user namespace that supplies the ID mappings,
//  3. attaches the tree at target with unix.MoveMount.
//
// usernsFd is a file descriptor referring to a user namespace (for example
// the one returned by GetUsernsFD). Any failing step is returned as a wrapped
// error; the tree file descriptor is always closed before returning.
func IDMapMount(source, target string, usernsFd int) (err error) {
	// Attr_clr and Propagation are intentionally left at their zero values.
	attr := unix.MountAttr{
		Attr_set:  unix.MOUNT_ATTR_IDMAP,
		Userns_fd: uint64(usernsFd),
	}

	// -EBADF is passed as the directory fd, so source/target are not
	// resolved relative to any open directory descriptor.
	dFd, err := unix.OpenTree(-int(unix.EBADF), source, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC|unix.AT_EMPTY_PATH))
	if err != nil {
		// The path being opened here is source, so report source.
		return fmt.Errorf("Unable to open tree for %s: %w", source, err)
	}
	defer unix.Close(dFd)

	if err = unix.MountSetattr(dFd, "", unix.AT_EMPTY_PATH, &attr); err != nil {
		return fmt.Errorf("Unable to shift GID/UID for %s: %w", target, err)
	}

	if err = unix.MoveMount(dFd, "", -int(unix.EBADF), target, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
		return fmt.Errorf("Unable to attach mount tree to %s: %w", target, err)
	}
	return nil
}
// GetUsernsFD forks the current process and creates a user namespace using the specified
// mappings.
//
// uidmap and gidmap are each a single mapping in the
// `container-id:host-id:size` form accepted by parseIDMapping.
//
// It returns:
//  1. The file descriptor of the /proc/[pid]/ns/user of the newly
//     created mapping.
//  2. "Clean up" function that should be called once user namespace
//     file descriptor is no longer needed.
//  3. Usual error.
//
// On error the returned descriptor is -1 and the clean up function is nil.
func GetUsernsFD(uidmap, gidmap string) (_ int, _ func(), err error) {
	var (
		usernsFile       *os.File
		pipeMap          [2]int
		pid              uintptr
		errno            syscall.Errno
		uidMaps, gidMaps []syscall.SysProcIDMap
	)

	// Validate both mappings before forking anything.
	if uidMaps, err = parseIDMapping(uidmap); err != nil {
		return -1, nil, err
	}
	if gidMaps, err = parseIDMapping(gidmap); err != nil {
		return -1, nil, err
	}

	// The pipe is created and the child forked while holding ForkLock.
	syscall.ForkLock.Lock()
	if err = syscall.Pipe2(pipeMap[:], syscall.O_CLOEXEC); err != nil {
		syscall.ForkLock.Unlock()
		return -1, nil, err
	}

	pid, errno = sys.ForkUserns(pipeMap)
	syscall.ForkLock.Unlock()
	if errno != 0 {
		syscall.Close(pipeMap[0])
		syscall.Close(pipeMap[1])
		return -1, nil, errno
	}

	// The parent only writes to the pipe; drop its copy of the read end.
	syscall.Close(pipeMap[0])

	// writeMappings writes every entry of idmap, one "container host size"
	// line each, to /proc/<pid>/<fname> in a single write.
	writeMappings := func(fname string, idmap []syscall.SysProcIDMap) error {
		var mappings strings.Builder
		for _, m := range idmap {
			fmt.Fprintf(&mappings, "%d %d %d\n", m.ContainerID, m.HostID, m.Size)
		}
		return os.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings.String()), 0600)
	}

	// cleanUpChild sends ProcSyncExit to the child over the pipe, closes the
	// write end and reaps the child. Failures are logged, not returned.
	// NOTE(review): presumably the child exits on ProcSyncExit — confirm in sys.ForkUserns.
	cleanUpChild := func() {
		sync := sys.ProcSyncExit
		if _, _, errno := syscall.Syscall6(syscall.SYS_WRITE, uintptr(pipeMap[1]), uintptr(unsafe.Pointer(&sync)), unsafe.Sizeof(sync), 0, 0, 0); errno != 0 {
			logrus.WithError(errno).Warnf("failed to sync with child (ProcSyncExit)")
		}
		syscall.Close(pipeMap[1])

		if _, err := unix.Wait4(int(pid), nil, 0, nil); err != nil {
			logrus.WithError(err).Warnf("failed to wait for child process; the SIGCHLD might be received by shim reaper")
		}
	}
	// Deferred so that it runs on every return path below, and only after
	// the namespace file has been opened on the success path.
	defer cleanUpChild()

	if err := writeMappings("uid_map", uidMaps); err != nil {
		return -1, nil, err
	}
	if err := writeMappings("gid_map", gidMaps); err != nil {
		return -1, nil, err
	}

	if usernsFile, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", pid)); err != nil {
		return -1, nil, fmt.Errorf("failed to get user ns file descriptor for - /proc/%d/ns/user: %w", pid, err)
	}

	// The returned closure owns usernsFile; the raw descriptor stays valid
	// until the caller invokes it.
	return int(usernsFile.Fd()), func() {
		usernsFile.Close()
	}, nil
}