diff --git a/cmd/ctr/commands/run/run_unix.go b/cmd/ctr/commands/run/run_unix.go index 64996b520..29f3c9043 100644 --- a/cmd/ctr/commands/run/run_unix.go +++ b/cmd/ctr/commands/run/run_unix.go @@ -186,8 +186,9 @@ func NewContainer(ctx gocontext.Context, client *containerd.Client, context *cli opts = append(opts, oci.WithUserNamespace([]specs.LinuxIDMapping{uidMap}, []specs.LinuxIDMapping{gidMap})) // use snapshotter opts or the remapped snapshot support to shift the filesystem - // currently the only snapshotter known to support the labels is fuse-overlayfs: - // https://github.com/AkihiroSuda/containerd-fuse-overlayfs + // currently the snapshotters known to support the labels are: + // fuse-overlayfs - https://github.com/containerd/fuse-overlayfs-snapshotter + // overlay - in case of idmapped mount points are supported by host kernel (Linux kernel 5.19) if context.Bool("remap-labels") { cOpts = append(cOpts, containerd.WithNewSnapshot(id, image, containerd.WithRemapperLabels(0, uidMap.HostID, 0, gidMap.HostID, uidMap.Size))) diff --git a/mount/mount_idmapped_linux.go b/mount/mount_idmapped_linux.go new file mode 100644 index 000000000..92208771e --- /dev/null +++ b/mount/mount_idmapped_linux.go @@ -0,0 +1,166 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
// parseIDMapping parses a single user namespace ID mapping given in the
// "container-id:host-id:size" form and returns it as a one-element slice.
//
// Only a container ID of 0 is supported. The host ID must be non-negative
// and the size must be strictly positive: a zero-length extent cannot map
// anything and would only be rejected later, when the mapping is written
// to the child's uid_map/gid_map.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	parts := strings.Split(mapping, ":")
	if len(parts) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}
	cID, err := strconv.Atoi(parts[0])
	if err != nil {
		return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
	}
	hID, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
	}
	size, err := strconv.Atoi(parts[2])
	if err != nil {
		return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
	}
	// size == 0 is rejected as well: the message below promises positive
	// values, and an empty mapping is never usable.
	if cID != 0 || hID < 0 || size <= 0 {
		return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers (container ID of 0 is only supported)", mapping)
	}
	return []syscall.SysProcIDMap{
		{
			ContainerID: cID,
			HostID:      hID,
			Size:        size,
		},
	}, nil
}
{ + return fmt.Errorf("Unable to attach mount tree to %s: %w", target, err) + } + return nil +} + +// GetUsernsFD forks the current process and creates a user namespace using the specified +// mappings. +// +// It returns: +// 1. The file descriptor of the /proc/[pid]/ns/user of the newly +// created mapping. +// 2. "Clean up" function that should be called once user namespace +// file descriptor is no longer needed. +// 3. Usual error. +func GetUsernsFD(uidmap, gidmap string) (_ int, _ func(), err error) { + var ( + usernsFile *os.File + pipeMap [2]int + pid uintptr + errno syscall.Errno + uidMaps, gidMaps []syscall.SysProcIDMap + ) + + if uidMaps, err = parseIDMapping(uidmap); err != nil { + return -1, nil, err + } + if gidMaps, err = parseIDMapping(gidmap); err != nil { + return -1, nil, err + } + + syscall.ForkLock.Lock() + if err = syscall.Pipe2(pipeMap[:], syscall.O_CLOEXEC); err != nil { + syscall.ForkLock.Unlock() + return -1, nil, err + } + + pid, errno = sys.ForkUserns(pipeMap) + syscall.ForkLock.Unlock() + if errno != 0 { + syscall.Close(pipeMap[0]) + syscall.Close(pipeMap[1]) + return -1, nil, errno + } + + syscall.Close(pipeMap[0]) + + writeMappings := func(fname string, idmap []syscall.SysProcIDMap) error { + mappings := "" + for _, m := range idmap { + mappings = fmt.Sprintf("%d %d %d\n", m.ContainerID, m.HostID, m.Size) + } + return os.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings), 0600) + } + + cleanUpChild := func() { + sync := sys.ProcSyncExit + if _, _, errno := syscall.Syscall6(syscall.SYS_WRITE, uintptr(pipeMap[1]), uintptr(unsafe.Pointer(&sync)), unsafe.Sizeof(sync), 0, 0, 0); errno != 0 { + logrus.WithError(errno).Warnf("failed to sync with child (ProcSyncExit)") + } + syscall.Close(pipeMap[1]) + + if _, err := unix.Wait4(int(pid), nil, 0, nil); err != nil { + logrus.WithError(err).Warnf("failed to wait for child process; the SIGHLD might be received by shim reaper") + } + } + defer cleanUpChild() + + if err := 
writeMappings("uid_map", uidMaps); err != nil { + return -1, nil, err + } + if err := writeMappings("gid_map", gidMaps); err != nil { + return -1, nil, err + } + + if usernsFile, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", pid)); err != nil { + return -1, nil, fmt.Errorf("failed to get user ns file descriptor for - /proc/%d/user/ns: %w", pid, err) + } + + return int(usernsFile.Fd()), func() { + usernsFile.Close() + }, nil +} diff --git a/mount/mount_linux.go b/mount/mount_linux.go index 90dd941a9..837d9b802 100644 --- a/mount/mount_linux.go +++ b/mount/mount_linux.go @@ -21,14 +21,26 @@ import ( "fmt" "os" "path" + "path/filepath" "runtime" + "strconv" "strings" "time" + "github.com/sirupsen/logrus" + exec "golang.org/x/sys/execabs" "golang.org/x/sys/unix" ) +type mountOpt struct { + flags int + data []string + losetup bool + uidmap string + gidmap string +} + var ( pagesize = 4096 allowedHelperBinaries = []string{"mount.fuse", "mount.fuse3"} @@ -38,6 +50,34 @@ func init() { pagesize = os.Getpagesize() } +// prepareIDMappedOverlay is a helper function to obtain +// actual "lowerdir=..." mount options. It creates and +// applies id mapping for each lowerdir. +// +// It returns: +// 1. New options that include new "lowedir=..." mount option. +// 2. "Clean up" function -- it should be called as a defer one before +// checking for error, because if do the second and avoid calling "clean up", +// you're going to have "dirty" setup -- there's no guarantee that those +// temporary mount points for lowedirs will be cleaned properly. +// 3. Error -- nil if everything's fine, otherwise an error. 
+func prepareIDMappedOverlay(usernsFd int, options []string) ([]string, func(), error) { + lowerIdx, lowerDirs := findOverlayLowerdirs(options) + if lowerIdx == -1 { + return options, nil, fmt.Errorf("failed to parse overlay lowerdir's from given options") + } + + tmpLowerdirs, idMapCleanUp, err := doPrepareIDMappedOverlay(lowerDirs, usernsFd) + if err != nil { + return options, idMapCleanUp, fmt.Errorf("failed to create idmapped mount: %w", err) + } + + options = append(options[:lowerIdx], options[lowerIdx+1:]...) + options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(tmpLowerdirs, ":"))) + + return options, idMapCleanUp, nil +} + // Mount to the provided target path. // // If m.Type starts with "fuse." or "fuse3.", "mount.fuse" or "mount.fuse3" @@ -51,45 +91,81 @@ func (m *Mount) mount(target string) (err error) { } } var ( - chdir string - options = m.Options + chdir string + recalcOpt bool + usernsFd int + options = m.Options ) + opt := parseMountOptions(options) + // The only remapping of both GID and UID is supported + if opt.uidmap != "" && opt.gidmap != "" { + var ( + childProcCleanUp func() + ) + if usernsFd, childProcCleanUp, err = GetUsernsFD(opt.uidmap, opt.gidmap); err != nil { + return err + } + defer childProcCleanUp() + + // overlay expects lowerdir's to be remapped instead + if m.Type == "overlay" { + var ( + userNsCleanUp func() + ) + options, userNsCleanUp, err = prepareIDMappedOverlay(usernsFd, options) + defer userNsCleanUp() + + if err != nil { + return fmt.Errorf("failed to prepare idmapped overlay: %w", err) + } + // To not meet concurrency issues while using the same lowedirs + // for different containers, replace them by temporary directories, + if optionsSize(options) >= pagesize-512 { + recalcOpt = true + } else { + opt = parseMountOptions(options) + } + } + } // avoid hitting one page limit of mount argument buffer // // NOTE: 512 is a buffer during pagesize check. 
if m.Type == "overlay" && optionsSize(options) >= pagesize-512 { chdir, options = compactLowerdirOption(options) + // recalculate opt in case of lowerdirs have been replaced + // by idmapped ones OR idmapped mounts' not used/supported. + if recalcOpt || (opt.uidmap == "" || opt.gidmap == "") { + opt = parseMountOptions(options) + } } - flags, data, losetup := parseMountOptions(options) - // propagation types. const ptypes = unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE // Ensure propagation type change flags aren't included in other calls. - oflags := flags &^ ptypes + oflags := opt.flags &^ ptypes var loopParams LoopParams - if losetup { + if opt.losetup { loopParams = LoopParams{ Readonly: oflags&unix.MS_RDONLY == unix.MS_RDONLY, Autoclear: true, } - loopParams.Direct, data = hasDirectIO(data) + loopParams.Direct, opt.data = hasDirectIO(opt.data) } - dataInStr := strings.Join(data, ",") + dataInStr := strings.Join(opt.data, ",") if len(dataInStr) > pagesize { return errors.New("mount options is too long") } - // In the case of remounting with changed data (data != ""), need to call mount (moby/moby#34077). - if flags&unix.MS_REMOUNT == 0 || dataInStr != "" { + // In the case of remounting with changed data (dataInStr != ""), need to call mount (moby/moby#34077). + if opt.flags&unix.MS_REMOUNT == 0 || dataInStr != "" { // Initial call applying all non-propagation flags for mount // or remount with changed data source := m.Source - if losetup { + if opt.losetup { loFile, err := setupLoop(m.Source, loopParams) if err != nil { return err @@ -104,10 +180,10 @@ func (m *Mount) mount(target string) (err error) { } } - if flags&ptypes != 0 { + if opt.flags&ptypes != 0 { // Change the propagation type. 
const pflags = ptypes | unix.MS_REC | unix.MS_SILENT - if err := unix.Mount("", target, "", uintptr(flags&pflags), ""); err != nil { + if err := unix.Mount("", target, "", uintptr(opt.flags&pflags), ""); err != nil { return err } } @@ -117,9 +193,45 @@ func (m *Mount) mount(target string) (err error) { // Remount the bind to apply read only. return unix.Mount("", target, "", uintptr(oflags|unix.MS_REMOUNT), "") } + + // remap non-overlay mount point + if opt.uidmap != "" && opt.gidmap != "" && m.Type != "overlay" { + if err := IDMapMount(target, target, usernsFd); err != nil { + return err + } + } return nil } +func doPrepareIDMappedOverlay(lowerDirs []string, usernsFd int) (tmpLowerDirs []string, _ func(), _ error) { + td, err := os.MkdirTemp(tempMountLocation, "ovl-idmapped") + if err != nil { + return nil, nil, err + } + cleanUp := func() { + for _, lowerDir := range tmpLowerDirs { + if err := unix.Unmount(lowerDir, 0); err != nil { + logrus.WithError(err).Warnf("failed to unmount temp lowerdir %s", lowerDir) + } + } + if terr := os.RemoveAll(filepath.Clean(filepath.Join(tmpLowerDirs[0], ".."))); terr != nil { + logrus.WithError(terr).Warnf("failed to remove temporary overlay lowerdir's") + } + } + for i, lowerDir := range lowerDirs { + tmpLowerDir := filepath.Join(td, strconv.Itoa(i)) + tmpLowerDirs = append(tmpLowerDirs, tmpLowerDir) + + if err = os.MkdirAll(tmpLowerDir, 0700); err != nil { + return nil, cleanUp, fmt.Errorf("failed to create temporary dir: %w", err) + } + if err = IDMapMount(lowerDir, tmpLowerDir, usernsFd); err != nil { + return nil, cleanUp, err + } + } + return tmpLowerDirs, cleanUp, nil +} + // Unmount the provided mount path with the flags func Unmount(target string, flags int) error { if err := unmount(target, flags); err != nil && err != unix.EINVAL { @@ -208,14 +320,9 @@ func UnmountAll(mount string, flags int) error { // parseMountOptions takes fstab style mount options and parses them for // use with a standard mount() syscall -func 
parseMountOptions(options []string) (int, []string, bool) { - var ( - flag int - losetup bool - data []string - ) +func parseMountOptions(options []string) (opt mountOpt) { loopOpt := "loop" - flags := map[string]struct { + flagsMap := map[string]struct { clear bool flag int }{ @@ -249,19 +356,23 @@ func parseMountOptions(options []string) (int, []string, bool) { // If the option does not exist in the flags table or the flag // is not supported on the platform, // then it is a data value for a specific fs type - if f, exists := flags[o]; exists && f.flag != 0 { + if f, exists := flagsMap[o]; exists && f.flag != 0 { if f.clear { - flag &^= f.flag + opt.flags &^= f.flag } else { - flag |= f.flag + opt.flags |= f.flag } } else if o == loopOpt { - losetup = true + opt.losetup = true + } else if strings.HasPrefix(o, "uidmap=") { + opt.uidmap = strings.TrimPrefix(o, "uidmap=") + } else if strings.HasPrefix(o, "gidmap=") { + opt.gidmap = strings.TrimPrefix(o, "gidmap=") } else { - data = append(data, o) + opt.data = append(opt.data, o) } } - return flag, data, losetup + return } func hasDirectIO(opts []string) (bool, []string) { diff --git a/sys/subprocess_unsafe_linux.go b/sys/subprocess_unsafe_linux.go new file mode 100644 index 000000000..6e40a9c7d --- /dev/null +++ b/sys/subprocess_unsafe_linux.go @@ -0,0 +1,30 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sys + +import ( + _ "unsafe" // required for go:linkname. 
+) + +//go:linkname beforeFork syscall.runtime_BeforeFork +func beforeFork() + +//go:linkname afterFork syscall.runtime_AfterFork +func afterFork() + +//go:linkname afterForkInChild syscall.runtime_AfterForkInChild +func afterForkInChild() diff --git a/sys/userns_unsafe_linux.go b/sys/userns_unsafe_linux.go new file mode 100644 index 000000000..bedf8943c --- /dev/null +++ b/sys/userns_unsafe_linux.go @@ -0,0 +1,65 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sys + +import ( + "runtime" + "syscall" + "unsafe" +) + +// ProcSyncType is used for synchronization +// between parent and child processes. +type ProcSyncType uint8 + +const ( + // ProcSyncExit tells child "it's time to exit". 
// ForkUserns clones the current process into a new user namespace
// (CLONE_NEWUSER, with SIGCHLD as the termination signal) using raw syscalls.
//
// pipeMap holds the two descriptors of a pipe: index 0 is read by the child,
// index 1 is closed by the child.
//
// In the parent it returns the child's pid and the errno of the clone call.
// The child never returns: it closes pipeMap[1], asks to receive SIGKILL when
// its parent dies, blocks reading one ProcSyncType value from pipeMap[0] and
// then exits, using the last errno as its exit status.
//
// NOTE(review): beforeFork/afterFork/afterForkInChild are declared elsewhere
// in this package; presumably they bracket the clone the same way the
// syscall package's own fork path does -- confirm against their declarations.
//
//go:norace
//go:noinline
func ForkUserns(pipeMap [2]int) (pid uintptr, errno syscall.Errno) {
	var sync ProcSyncType

	beforeFork()
	// On s390x the clone flags are passed as the second syscall argument
	// instead of the first.
	if runtime.GOARCH == "s390x" {
		pid, _, errno = syscall.RawSyscall6(uintptr(syscall.SYS_CLONE), 0, syscall.CLONE_NEWUSER|uintptr(syscall.SIGCHLD), 0, 0, 0, 0)
	} else {
		pid, _, errno = syscall.RawSyscall6(uintptr(syscall.SYS_CLONE), syscall.CLONE_NEWUSER|uintptr(syscall.SIGCHLD), 0, 0, 0, 0, 0)
	}
	// Clone failed, or we are the parent (pid != 0): finish the fork
	// bookkeeping and hand the result back to the caller.
	if errno != 0 || pid != 0 {
		afterFork()
		return pid, errno
	}

	// From here on we are in the child; only raw syscalls are used.
	afterForkInChild()
	if _, _, errno = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(pipeMap[1]), 0, 0); errno != 0 {
		goto err
	}
	// Request SIGKILL when the parent dies.
	if _, _, errno = syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); errno != 0 {
		goto err
	}
	// wait for parent's signal
	if _, _, errno = syscall.RawSyscall6(syscall.SYS_READ, uintptr(pipeMap[0]), uintptr(unsafe.Pointer(&sync)), unsafe.Sizeof(sync), 0, 0, 0); errno != 0 || sync != ProcSyncExit {
		goto err
	}

	// Both the success path and the failure paths end here: the child exits
	// with the current errno (0 when every syscall above succeeded).
err:
	syscall.RawSyscall6(syscall.SYS_EXIT, uintptr(errno), 0, 0, 0, 0, 0)
	panic("unreachable")
}