mount: support idmapped mount points

This patch introduces idmapped mounts support for
container rootfs.

The idmapped mounts support was merged in Linux kernel 5.12
torvalds/linux@7d6beb7.
This functionality makes it possible to address the chown overhead for
containers that use a user namespace.

The changes are based on experimental patchset published by
Mauricio Vásquez #4734.
The current version reimplements support for idmapped mounts in Go.

Performance measurement results:
Image           idmapped mount  recursive chown
BusyBox         00.135          04.964
Ubuntu          00.171          15.713
Fedora          00.143          38.799

Signed-off-by: Mauricio Vásquez <mauricio@kinvolk.io>
Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com>
Signed-off-by: Alexey Perevalov <alexey.perevalov@huawei.com>
Signed-off-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
This commit is contained in:
Ilya Hanov 2023-04-19 17:48:25 +08:00
parent 723c88ce30
commit 1555a31bf6
5 changed files with 401 additions and 28 deletions

View File

@ -186,8 +186,9 @@ func NewContainer(ctx gocontext.Context, client *containerd.Client, context *cli
opts = append(opts, opts = append(opts,
oci.WithUserNamespace([]specs.LinuxIDMapping{uidMap}, []specs.LinuxIDMapping{gidMap})) oci.WithUserNamespace([]specs.LinuxIDMapping{uidMap}, []specs.LinuxIDMapping{gidMap}))
// use snapshotter opts or the remapped snapshot support to shift the filesystem // use snapshotter opts or the remapped snapshot support to shift the filesystem
// currently the only snapshotter known to support the labels is fuse-overlayfs: // currently the snapshotters known to support the labels are:
// https://github.com/AkihiroSuda/containerd-fuse-overlayfs // fuse-overlayfs - https://github.com/containerd/fuse-overlayfs-snapshotter
// overlay - in case idmapped mount points are supported by the host kernel (Linux kernel 5.19)
if context.Bool("remap-labels") { if context.Bool("remap-labels") {
cOpts = append(cOpts, containerd.WithNewSnapshot(id, image, cOpts = append(cOpts, containerd.WithNewSnapshot(id, image,
containerd.WithRemapperLabels(0, uidMap.HostID, 0, gidMap.HostID, uidMap.Size))) containerd.WithRemapperLabels(0, uidMap.HostID, 0, gidMap.HostID, uidMap.Size)))

View File

@ -0,0 +1,166 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mount
import (
"fmt"
"os"
"strconv"
"strings"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
"github.com/containerd/containerd/sys"
"github.com/sirupsen/logrus"
)
// parseIDMapping parses a single user namespace ID mapping given in the
// `container-id:host-id:size` format.
//
// The container ID must be 0, the host ID must not be negative and the size
// must be strictly positive: a mapping that covers no IDs is rejected here
// rather than being handed on to the code that sets up the user namespace.
//
// On success the returned slice holds exactly one entry.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	parts := strings.Split(mapping, ":")
	if len(parts) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}

	cID, err := strconv.Atoi(parts[0])
	if err != nil {
		return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
	}

	hID, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
	}

	size, err := strconv.Atoi(parts[2])
	if err != nil {
		return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
	}

	// size == 0 is refused as well: it describes an empty range.
	if cID != 0 || hID < 0 || size <= 0 {
		return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers (container ID of 0 is only supported)", mapping)
	}

	return []syscall.SysProcIDMap{
		{
			ContainerID: cID,
			HostID:      hID,
			Size:        size,
		},
	}, nil
}
// IDMapMount applies GID/UID shift according to gidmap/uidmap for target path.
//
// It works in three steps:
//  1. clone the mount tree found at source (open_tree with OPEN_TREE_CLONE),
//  2. mark the clone as idmapped using the user namespace referred to by
//     usernsFd (mount_setattr with MOUNT_ATTR_IDMAP),
//  3. attach the clone at target (move_mount).
//
// The returned error is wrapped with the path the failing step operated on:
// source for the first step, target for the other two.
func IDMapMount(source, target string, usernsFd int) (err error) {
	var (
		attr unix.MountAttr
	)

	attr.Attr_set = unix.MOUNT_ATTR_IDMAP
	attr.Attr_clr = 0
	attr.Propagation = 0
	attr.Userns_fd = uint64(usernsFd)

	dFd, err := unix.OpenTree(-int(unix.EBADF), source, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC|unix.AT_EMPTY_PATH))
	if err != nil {
		// It is source that could not be opened; target is not touched yet.
		return fmt.Errorf("Unable to open tree for %s: %w", source, err)
	}
	defer unix.Close(dFd)

	if err = unix.MountSetattr(dFd, "", unix.AT_EMPTY_PATH, &attr); err != nil {
		return fmt.Errorf("Unable to shift GID/UID for %s: %w", target, err)
	}

	if err = unix.MoveMount(dFd, "", -int(unix.EBADF), target, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
		return fmt.Errorf("Unable to attach mount tree to %s: %w", target, err)
	}
	return nil
}
// GetUsernsFD forks the current process and creates a user namespace using the specified
// mappings.
//
// uidmap and gidmap use the `container-id:host-id:size` format accepted by
// parseIDMapping. The forked child only exists to own the new user namespace
// while the mappings are written and /proc/[pid]/ns/user is opened; it is
// told to exit and is waited for before GetUsernsFD returns.
//
// It returns:
//  1. The file descriptor of the /proc/[pid]/ns/user of the newly
//     created mapping.
//  2. "Clean up" function that should be called once user namespace
//     file descriptor is no longer needed.
//  3. Usual error.
//
// On error the descriptor is -1 and the clean up function is nil.
func GetUsernsFD(uidmap, gidmap string) (_ int, _ func(), err error) {
	var (
		usernsFile       *os.File
		pipeMap          [2]int
		pid              uintptr
		errno            syscall.Errno
		uidMaps, gidMaps []syscall.SysProcIDMap
	)

	if uidMaps, err = parseIDMapping(uidmap); err != nil {
		return -1, nil, err
	}
	if gidMaps, err = parseIDMapping(gidmap); err != nil {
		return -1, nil, err
	}

	// The pipe is created and the child is forked under ForkLock.
	syscall.ForkLock.Lock()
	if err = syscall.Pipe2(pipeMap[:], syscall.O_CLOEXEC); err != nil {
		syscall.ForkLock.Unlock()
		return -1, nil, err
	}

	pid, errno = sys.ForkUserns(pipeMap)
	syscall.ForkLock.Unlock()
	if errno != 0 {
		syscall.Close(pipeMap[0])
		syscall.Close(pipeMap[1])
		return -1, nil, errno
	}

	// The parent only writes to the pipe; the read end belongs to the child.
	syscall.Close(pipeMap[0])

	writeMappings := func(fname string, idmap []syscall.SysProcIDMap) error {
		// Accumulate one line per mapping and write them all at once, so
		// that every entry of idmap ends up in the file, not just the last.
		var mappings strings.Builder
		for _, m := range idmap {
			fmt.Fprintf(&mappings, "%d %d %d\n", m.ContainerID, m.HostID, m.Size)
		}
		return os.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings.String()), 0600)
	}

	cleanUpChild := func() {
		// Tell the child to exit, then reap it.
		sync := sys.ProcSyncExit
		if _, _, errno := syscall.Syscall6(syscall.SYS_WRITE, uintptr(pipeMap[1]), uintptr(unsafe.Pointer(&sync)), unsafe.Sizeof(sync), 0, 0, 0); errno != 0 {
			logrus.WithError(errno).Warnf("failed to sync with child (ProcSyncExit)")
		}
		syscall.Close(pipeMap[1])

		if _, err := unix.Wait4(int(pid), nil, 0, nil); err != nil {
			logrus.WithError(err).Warnf("failed to wait for child process; the SIGCHLD might be received by shim reaper")
		}
	}
	defer cleanUpChild()

	if err := writeMappings("uid_map", uidMaps); err != nil {
		return -1, nil, err
	}
	if err := writeMappings("gid_map", gidMaps); err != nil {
		return -1, nil, err
	}

	if usernsFile, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", pid)); err != nil {
		return -1, nil, fmt.Errorf("failed to get user ns file descriptor for - /proc/%d/ns/user: %w", pid, err)
	}

	return int(usernsFile.Fd()), func() {
		usernsFile.Close()
	}, nil
}

View File

@ -21,14 +21,26 @@ import (
"fmt" "fmt"
"os" "os"
"path" "path"
"path/filepath"
"runtime" "runtime"
"strconv"
"strings" "strings"
"time" "time"
"github.com/sirupsen/logrus"
exec "golang.org/x/sys/execabs" exec "golang.org/x/sys/execabs"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
// mountOpt is the parsed form of a list of fstab style mount options, as
// produced by parseMountOptions.
type mountOpt struct {
	// flags is the bitmask built from the options that map to mount flags.
	flags int
	// data collects the options that are not recognised as flags; they are
	// passed on as the filesystem specific data.
	data []string
	// losetup is set when the "loop" option is present.
	losetup bool
	// uidmap and gidmap carry the values of the "uidmap=" and "gidmap="
	// options; empty when the option is absent.
	uidmap, gidmap string
}
var ( var (
pagesize = 4096 pagesize = 4096
allowedHelperBinaries = []string{"mount.fuse", "mount.fuse3"} allowedHelperBinaries = []string{"mount.fuse", "mount.fuse3"}
@ -38,6 +50,34 @@ func init() {
pagesize = os.Getpagesize() pagesize = os.Getpagesize()
} }
// prepareIDMappedOverlay is a helper function to obtain
// actual "lowerdir=..." mount options. It creates and
// applies id mapping for each lowerdir.
//
// It returns:
//  1. New options that include the new "lowerdir=..." mount option. The
//     slice passed in is left untouched; on error it is returned as is.
//  2. "Clean up" function -- it is never nil and should be deferred BEFORE
//     checking the error: when the preparation fails half way, it is the
//     only thing that releases the temporary mount points already created
//     for the lowerdirs.
//  3. Error -- nil if everything's fine, otherwise an error.
func prepareIDMappedOverlay(usernsFd int, options []string) ([]string, func(), error) {
	// Returned whenever there is nothing to undo, so that an unconditional
	// `defer cleanUp()` in the caller cannot hit a nil function.
	noop := func() {}

	lowerIdx, lowerDirs := findOverlayLowerdirs(options)
	if lowerIdx == -1 {
		return options, noop, fmt.Errorf("failed to parse overlay lowerdir's from given options")
	}

	tmpLowerdirs, idMapCleanUp, err := doPrepareIDMappedOverlay(lowerDirs, usernsFd)
	if idMapCleanUp == nil {
		idMapCleanUp = noop
	}
	if err != nil {
		return options, idMapCleanUp, fmt.Errorf("failed to create idmapped mount: %w", err)
	}

	// Build the result in a fresh slice: appending to options[:lowerIdx]
	// would shift and overwrite elements of the caller's backing array.
	remapped := make([]string, 0, len(options))
	remapped = append(remapped, options[:lowerIdx]...)
	remapped = append(remapped, options[lowerIdx+1:]...)
	remapped = append(remapped, fmt.Sprintf("lowerdir=%s", strings.Join(tmpLowerdirs, ":")))
	return remapped, idMapCleanUp, nil
}
// Mount to the provided target path. // Mount to the provided target path.
// //
// If m.Type starts with "fuse." or "fuse3.", "mount.fuse" or "mount.fuse3" // If m.Type starts with "fuse." or "fuse3.", "mount.fuse" or "mount.fuse3"
@ -52,44 +92,80 @@ func (m *Mount) mount(target string) (err error) {
} }
var ( var (
chdir string chdir string
recalcOpt bool
usernsFd int
options = m.Options options = m.Options
) )
opt := parseMountOptions(options)
// The only remapping of both GID and UID is supported
if opt.uidmap != "" && opt.gidmap != "" {
var (
childProcCleanUp func()
)
if usernsFd, childProcCleanUp, err = GetUsernsFD(opt.uidmap, opt.gidmap); err != nil {
return err
}
defer childProcCleanUp()
// overlay expects lowerdir's to be remapped instead
if m.Type == "overlay" {
var (
userNsCleanUp func()
)
options, userNsCleanUp, err = prepareIDMappedOverlay(usernsFd, options)
defer userNsCleanUp()
if err != nil {
return fmt.Errorf("failed to prepare idmapped overlay: %w", err)
}
// To not meet concurrency issues while using the same lowedirs
// for different containers, replace them by temporary directories,
if optionsSize(options) >= pagesize-512 {
recalcOpt = true
} else {
opt = parseMountOptions(options)
}
}
}
// avoid hitting one page limit of mount argument buffer // avoid hitting one page limit of mount argument buffer
// //
// NOTE: 512 is a buffer during pagesize check. // NOTE: 512 is a buffer during pagesize check.
if m.Type == "overlay" && optionsSize(options) >= pagesize-512 { if m.Type == "overlay" && optionsSize(options) >= pagesize-512 {
chdir, options = compactLowerdirOption(options) chdir, options = compactLowerdirOption(options)
// recalculate opt in case of lowerdirs have been replaced
// by idmapped ones OR idmapped mounts' not used/supported.
if recalcOpt || (opt.uidmap == "" || opt.gidmap == "") {
opt = parseMountOptions(options)
}
} }
flags, data, losetup := parseMountOptions(options)
// propagation types. // propagation types.
const ptypes = unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE const ptypes = unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE
// Ensure propagation type change flags aren't included in other calls. // Ensure propagation type change flags aren't included in other calls.
oflags := flags &^ ptypes oflags := opt.flags &^ ptypes
var loopParams LoopParams var loopParams LoopParams
if losetup { if opt.losetup {
loopParams = LoopParams{ loopParams = LoopParams{
Readonly: oflags&unix.MS_RDONLY == unix.MS_RDONLY, Readonly: oflags&unix.MS_RDONLY == unix.MS_RDONLY,
Autoclear: true, Autoclear: true,
} }
loopParams.Direct, data = hasDirectIO(data) loopParams.Direct, opt.data = hasDirectIO(opt.data)
} }
dataInStr := strings.Join(data, ",") dataInStr := strings.Join(opt.data, ",")
if len(dataInStr) > pagesize { if len(dataInStr) > pagesize {
return errors.New("mount options is too long") return errors.New("mount options is too long")
} }
// In the case of remounting with changed data (data != ""), need to call mount (moby/moby#34077). // In the case of remounting with changed data (dataInStr != ""), need to call mount (moby/moby#34077).
if flags&unix.MS_REMOUNT == 0 || dataInStr != "" { if opt.flags&unix.MS_REMOUNT == 0 || dataInStr != "" {
// Initial call applying all non-propagation flags for mount // Initial call applying all non-propagation flags for mount
// or remount with changed data // or remount with changed data
source := m.Source source := m.Source
if losetup { if opt.losetup {
loFile, err := setupLoop(m.Source, loopParams) loFile, err := setupLoop(m.Source, loopParams)
if err != nil { if err != nil {
return err return err
@ -104,10 +180,10 @@ func (m *Mount) mount(target string) (err error) {
} }
} }
if flags&ptypes != 0 { if opt.flags&ptypes != 0 {
// Change the propagation type. // Change the propagation type.
const pflags = ptypes | unix.MS_REC | unix.MS_SILENT const pflags = ptypes | unix.MS_REC | unix.MS_SILENT
if err := unix.Mount("", target, "", uintptr(flags&pflags), ""); err != nil { if err := unix.Mount("", target, "", uintptr(opt.flags&pflags), ""); err != nil {
return err return err
} }
} }
@ -117,9 +193,45 @@ func (m *Mount) mount(target string) (err error) {
// Remount the bind to apply read only. // Remount the bind to apply read only.
return unix.Mount("", target, "", uintptr(oflags|unix.MS_REMOUNT), "") return unix.Mount("", target, "", uintptr(oflags|unix.MS_REMOUNT), "")
} }
// remap non-overlay mount point
if opt.uidmap != "" && opt.gidmap != "" && m.Type != "overlay" {
if err := IDMapMount(target, target, usernsFd); err != nil {
return err
}
}
return nil return nil
} }
// doPrepareIDMappedOverlay creates a temporary directory under
// tempMountLocation and, for every entry of lowerDirs, attaches an idmapped
// clone of it (via IDMapMount with usernsFd) at a numbered sub-directory.
//
// It returns:
//  1. The temporary lowerdir paths, in the same order as lowerDirs
//     (nil on error).
//  2. A clean up function, never nil. It unmounts the temporary lowerdirs
//     that were actually mounted and removes the temporary directory; it is
//     also valid to call after a failure.
//  3. An error if the temporary directory, one of its sub-directories or
//     one of the idmapped mounts could not be created.
func doPrepareIDMappedOverlay(lowerDirs []string, usernsFd int) (tmpLowerDirs []string, _ func(), _ error) {
	td, err := os.MkdirTemp(tempMountLocation, "ovl-idmapped")
	if err != nil {
		// Nothing has been created, but callers defer the clean up
		// unconditionally, so hand back a no-op instead of nil.
		return nil, func() {}, err
	}

	// The mounted paths are tracked in a local rather than in the named
	// result: `return nil, cleanUp, err` assigns nil to tmpLowerDirs before
	// the caller gets a chance to run cleanUp.
	mounted := make([]string, 0, len(lowerDirs))
	cleanUp := func() {
		allUnmounted := true
		for _, lowerDir := range mounted {
			if err := unix.Unmount(lowerDir, 0); err != nil {
				allUnmounted = false
				logrus.WithError(err).Warnf("failed to unmount temp lowerdir %s", lowerDir)
			}
		}
		if !allUnmounted {
			// A recursive removal would descend into the lowerdir that is
			// still mounted, so leave the temporary directory in place.
			logrus.Warnf("skip removing temporary overlay lowerdir's in %s", td)
			return
		}
		if terr := os.RemoveAll(td); terr != nil {
			logrus.WithError(terr).Warnf("failed to remove temporary overlay lowerdir's")
		}
	}

	for i, lowerDir := range lowerDirs {
		tmpLowerDir := filepath.Join(td, strconv.Itoa(i))

		if err = os.MkdirAll(tmpLowerDir, 0700); err != nil {
			return nil, cleanUp, fmt.Errorf("failed to create temporary dir: %w", err)
		}
		if err = IDMapMount(lowerDir, tmpLowerDir, usernsFd); err != nil {
			return nil, cleanUp, err
		}
		// Only record the path once something is really mounted on it.
		mounted = append(mounted, tmpLowerDir)
	}
	return mounted, cleanUp, nil
}
// Unmount the provided mount path with the flags // Unmount the provided mount path with the flags
func Unmount(target string, flags int) error { func Unmount(target string, flags int) error {
if err := unmount(target, flags); err != nil && err != unix.EINVAL { if err := unmount(target, flags); err != nil && err != unix.EINVAL {
@ -208,14 +320,9 @@ func UnmountAll(mount string, flags int) error {
// parseMountOptions takes fstab style mount options and parses them for // parseMountOptions takes fstab style mount options and parses them for
// use with a standard mount() syscall // use with a standard mount() syscall
func parseMountOptions(options []string) (int, []string, bool) { func parseMountOptions(options []string) (opt mountOpt) {
var (
flag int
losetup bool
data []string
)
loopOpt := "loop" loopOpt := "loop"
flags := map[string]struct { flagsMap := map[string]struct {
clear bool clear bool
flag int flag int
}{ }{
@ -249,19 +356,23 @@ func parseMountOptions(options []string) (int, []string, bool) {
// If the option does not exist in the flags table or the flag // If the option does not exist in the flags table or the flag
// is not supported on the platform, // is not supported on the platform,
// then it is a data value for a specific fs type // then it is a data value for a specific fs type
if f, exists := flags[o]; exists && f.flag != 0 { if f, exists := flagsMap[o]; exists && f.flag != 0 {
if f.clear { if f.clear {
flag &^= f.flag opt.flags &^= f.flag
} else { } else {
flag |= f.flag opt.flags |= f.flag
} }
} else if o == loopOpt { } else if o == loopOpt {
losetup = true opt.losetup = true
} else if strings.HasPrefix(o, "uidmap=") {
opt.uidmap = strings.TrimPrefix(o, "uidmap=")
} else if strings.HasPrefix(o, "gidmap=") {
opt.gidmap = strings.TrimPrefix(o, "gidmap=")
} else { } else {
data = append(data, o) opt.data = append(opt.data, o)
} }
} }
return flag, data, losetup return
} }
func hasDirectIO(opts []string) (bool, []string) { func hasDirectIO(opts []string) (bool, []string) {

View File

@ -0,0 +1,30 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
_ "unsafe" // required for go:linkname.
)
// beforeFork is a bodyless declaration bound through go:linkname to
// syscall.runtime_BeforeFork. ForkUserns calls it right before issuing the
// clone syscall.
// NOTE(review): presumably prepares the Go runtime for a fork the same way
// the standard library does — confirm against the runtime sources.
//
//go:linkname beforeFork syscall.runtime_BeforeFork
func beforeFork()

// afterFork is a bodyless declaration bound through go:linkname to
// syscall.runtime_AfterFork. ForkUserns calls it on the path that returns to
// the caller (clone failed, or clone returned a non-zero pid).
//
//go:linkname afterFork syscall.runtime_AfterFork
func afterFork()

// afterForkInChild is a bodyless declaration bound through go:linkname to
// syscall.runtime_AfterForkInChild. ForkUserns calls it first thing in the
// child (clone returned pid 0 without error).
//
//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
func afterForkInChild()

View File

@ -0,0 +1,65 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"runtime"
"syscall"
"unsafe"
)
// ProcSyncType is the one-byte message type used to synchronize
// a parent process with its child.
type ProcSyncType uint8

// ProcSyncExit tells child "it's time to exit".
const ProcSyncExit ProcSyncType = 1
// ForkUserns clones the current process into a new user namespace
// (CLONE_NEWUSER, with SIGCHLD as the termination signal).
//
// pipeMap is a pipe shared with the child. The child closes pipeMap[1], asks
// to be sent SIGKILL when its parent dies (PR_SET_PDEATHSIG) and then blocks
// reading one ProcSyncType value from pipeMap[0]; once the read returns, or
// as soon as one of these steps fails, the child exits with the last errno as
// its exit status.
//
// In the parent it returns the pid of the child, or a non-zero errno when the
// clone failed. It never returns in the child.
//
// NOTE(review): the visible caller holds syscall.ForkLock around this call —
// confirm whether that is a requirement for every caller.
//
//go:norace
//go:noinline
func ForkUserns(pipeMap [2]int) (pid uintptr, errno syscall.Errno) {
	var sync ProcSyncType

	beforeFork()
	// On s390x the clone flags are passed as the second syscall argument
	// instead of the first one.
	if runtime.GOARCH == "s390x" {
		pid, _, errno = syscall.RawSyscall6(uintptr(syscall.SYS_CLONE), 0, syscall.CLONE_NEWUSER|uintptr(syscall.SIGCHLD), 0, 0, 0, 0)
	} else {
		pid, _, errno = syscall.RawSyscall6(uintptr(syscall.SYS_CLONE), syscall.CLONE_NEWUSER|uintptr(syscall.SIGCHLD), 0, 0, 0, 0, 0)
	}
	// Parent (pid != 0) or failed clone (errno != 0): hand the result back.
	if errno != 0 || pid != 0 {
		afterFork()
		return pid, errno
	}

	// Everything below runs in the child only and uses raw syscalls.
	afterForkInChild()

	// The write end of the pipe is only used by the parent.
	if _, _, errno = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(pipeMap[1]), 0, 0); errno != 0 {
		goto err
	}

	// Request SIGKILL for the child when its parent dies.
	if _, _, errno = syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); errno != 0 {
		goto err
	}

	// Wait for the parent's signal: block until one ProcSyncType value can be
	// read from the pipe (or the read fails). Both outcomes end up at err.
	if _, _, errno = syscall.RawSyscall6(syscall.SYS_READ, uintptr(pipeMap[0]), uintptr(unsafe.Pointer(&sync)), unsafe.Sizeof(sync), 0, 0, 0); errno != 0 || sync != ProcSyncExit {
		goto err
	}

err:
	// Reached on success as well as on failure; errno is 0 when every
	// syscall above succeeded.
	syscall.RawSyscall6(syscall.SYS_EXIT, uintptr(errno), 0, 0, 0, 0, 0)
	panic("unreachable")
}