560
core/mount/mount_linux.go
Normal file
560
core/mount/mount_linux.go
Normal file
@@ -0,0 +1,560 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package mount
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
type mountOpt struct {
|
||||
flags int
|
||||
data []string
|
||||
losetup bool
|
||||
uidmap string
|
||||
gidmap string
|
||||
}
|
||||
|
||||
var (
|
||||
pagesize = 4096
|
||||
allowedHelperBinaries = []string{"mount.fuse", "mount.fuse3"}
|
||||
)
|
||||
|
||||
func init() {
|
||||
pagesize = os.Getpagesize()
|
||||
}
|
||||
|
||||
// prepareIDMappedOverlay is a helper function to obtain
|
||||
// actual "lowerdir=..." mount options. It creates and
|
||||
// applies id mapping for each lowerdir.
|
||||
//
|
||||
// It returns:
|
||||
// 1. New options that include new "lowedir=..." mount option.
|
||||
// 2. "Clean up" function -- it should be called as a defer one before
|
||||
// checking for error, because if do the second and avoid calling "clean up",
|
||||
// you're going to have "dirty" setup -- there's no guarantee that those
|
||||
// temporary mount points for lowedirs will be cleaned properly.
|
||||
// 3. Error -- nil if everything's fine, otherwise an error.
|
||||
func prepareIDMappedOverlay(usernsFd int, options []string) ([]string, func(), error) {
|
||||
lowerIdx, lowerDirs := findOverlayLowerdirs(options)
|
||||
if lowerIdx == -1 {
|
||||
return options, nil, fmt.Errorf("failed to parse overlay lowerdir's from given options")
|
||||
}
|
||||
|
||||
tmpLowerdirs, idMapCleanUp, err := doPrepareIDMappedOverlay(lowerDirs, usernsFd)
|
||||
if err != nil {
|
||||
return options, idMapCleanUp, fmt.Errorf("failed to create idmapped mount: %w", err)
|
||||
}
|
||||
|
||||
options = append(options[:lowerIdx], options[lowerIdx+1:]...)
|
||||
options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(tmpLowerdirs, ":")))
|
||||
|
||||
return options, idMapCleanUp, nil
|
||||
}
|
||||
|
||||
// Mount to the provided target path.
|
||||
//
|
||||
// If m.Type starts with "fuse." or "fuse3.", "mount.fuse" or "mount.fuse3"
|
||||
// helper binary is called.
|
||||
func (m *Mount) mount(target string) (err error) {
|
||||
for _, helperBinary := range allowedHelperBinaries {
|
||||
// helperBinary = "mount.fuse", typePrefix = "fuse."
|
||||
typePrefix := strings.TrimPrefix(helperBinary, "mount.") + "."
|
||||
if strings.HasPrefix(m.Type, typePrefix) {
|
||||
return m.mountWithHelper(helperBinary, typePrefix, target)
|
||||
}
|
||||
}
|
||||
var (
|
||||
chdir string
|
||||
recalcOpt bool
|
||||
usernsFd *os.File
|
||||
options = m.Options
|
||||
)
|
||||
opt := parseMountOptions(options)
|
||||
// The only remapping of both GID and UID is supported
|
||||
if opt.uidmap != "" && opt.gidmap != "" {
|
||||
if usernsFd, err = GetUsernsFD(opt.uidmap, opt.gidmap); err != nil {
|
||||
return err
|
||||
}
|
||||
defer usernsFd.Close()
|
||||
|
||||
// overlay expects lowerdir's to be remapped instead
|
||||
if m.Type == "overlay" {
|
||||
var (
|
||||
userNsCleanUp func()
|
||||
)
|
||||
options, userNsCleanUp, err = prepareIDMappedOverlay(int(usernsFd.Fd()), options)
|
||||
defer userNsCleanUp()
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to prepare idmapped overlay: %w", err)
|
||||
}
|
||||
// To not meet concurrency issues while using the same lowedirs
|
||||
// for different containers, replace them by temporary directories,
|
||||
if optionsSize(options) >= pagesize-512 {
|
||||
recalcOpt = true
|
||||
} else {
|
||||
opt = parseMountOptions(options)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// avoid hitting one page limit of mount argument buffer
|
||||
//
|
||||
// NOTE: 512 is a buffer during pagesize check.
|
||||
if m.Type == "overlay" && optionsSize(options) >= pagesize-512 {
|
||||
chdir, options = compactLowerdirOption(options)
|
||||
// recalculate opt in case of lowerdirs have been replaced
|
||||
// by idmapped ones OR idmapped mounts' not used/supported.
|
||||
if recalcOpt || (opt.uidmap == "" || opt.gidmap == "") {
|
||||
opt = parseMountOptions(options)
|
||||
}
|
||||
}
|
||||
|
||||
// propagation types.
|
||||
const ptypes = unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE
|
||||
|
||||
// Ensure propagation type change flags aren't included in other calls.
|
||||
oflags := opt.flags &^ ptypes
|
||||
|
||||
var loopParams LoopParams
|
||||
if opt.losetup {
|
||||
loopParams = LoopParams{
|
||||
Readonly: oflags&unix.MS_RDONLY == unix.MS_RDONLY,
|
||||
Autoclear: true,
|
||||
}
|
||||
loopParams.Direct, opt.data = hasDirectIO(opt.data)
|
||||
}
|
||||
|
||||
dataInStr := strings.Join(opt.data, ",")
|
||||
if len(dataInStr) > pagesize {
|
||||
return errors.New("mount options is too long")
|
||||
}
|
||||
|
||||
// In the case of remounting with changed data (dataInStr != ""), need to call mount (moby/moby#34077).
|
||||
if opt.flags&unix.MS_REMOUNT == 0 || dataInStr != "" {
|
||||
// Initial call applying all non-propagation flags for mount
|
||||
// or remount with changed data
|
||||
source := m.Source
|
||||
if opt.losetup {
|
||||
loFile, err := setupLoop(m.Source, loopParams)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer loFile.Close()
|
||||
|
||||
// Mount the loop device instead
|
||||
source = loFile.Name()
|
||||
}
|
||||
if err := mountAt(chdir, source, target, m.Type, uintptr(oflags), dataInStr); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if opt.flags&ptypes != 0 {
|
||||
// Change the propagation type.
|
||||
const pflags = ptypes | unix.MS_REC | unix.MS_SILENT
|
||||
if err := unix.Mount("", target, "", uintptr(opt.flags&pflags), ""); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
const broflags = unix.MS_BIND | unix.MS_RDONLY
|
||||
if oflags&broflags == broflags {
|
||||
// Remount the bind to apply read only.
|
||||
return unix.Mount("", target, "", uintptr(oflags|unix.MS_REMOUNT), "")
|
||||
}
|
||||
|
||||
// remap non-overlay mount point
|
||||
if opt.uidmap != "" && opt.gidmap != "" && m.Type != "overlay" {
|
||||
if err := IDMapMount(target, target, int(usernsFd.Fd())); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func doPrepareIDMappedOverlay(lowerDirs []string, usernsFd int) (tmpLowerDirs []string, _ func(), _ error) {
|
||||
td, err := os.MkdirTemp(tempMountLocation, "ovl-idmapped")
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
cleanUp := func() {
|
||||
for _, lowerDir := range tmpLowerDirs {
|
||||
if err := unix.Unmount(lowerDir, 0); err != nil {
|
||||
logrus.WithError(err).Warnf("failed to unmount temp lowerdir %s", lowerDir)
|
||||
}
|
||||
}
|
||||
if terr := os.RemoveAll(filepath.Clean(filepath.Join(tmpLowerDirs[0], ".."))); terr != nil {
|
||||
logrus.WithError(terr).Warnf("failed to remove temporary overlay lowerdir's")
|
||||
}
|
||||
}
|
||||
for i, lowerDir := range lowerDirs {
|
||||
tmpLowerDir := filepath.Join(td, strconv.Itoa(i))
|
||||
tmpLowerDirs = append(tmpLowerDirs, tmpLowerDir)
|
||||
|
||||
if err = os.MkdirAll(tmpLowerDir, 0700); err != nil {
|
||||
return nil, cleanUp, fmt.Errorf("failed to create temporary dir: %w", err)
|
||||
}
|
||||
if err = IDMapMount(lowerDir, tmpLowerDir, usernsFd); err != nil {
|
||||
return nil, cleanUp, err
|
||||
}
|
||||
}
|
||||
return tmpLowerDirs, cleanUp, nil
|
||||
}
|
||||
|
||||
// Unmount the provided mount path with the flags
|
||||
func Unmount(target string, flags int) error {
|
||||
if err := unmount(target, flags); err != nil && err != unix.EINVAL {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// fuseSuperMagic is defined in statfs(2)
|
||||
const fuseSuperMagic = 0x65735546
|
||||
|
||||
func isFUSE(dir string) bool {
|
||||
var st unix.Statfs_t
|
||||
if err := unix.Statfs(dir, &st); err != nil {
|
||||
return false
|
||||
}
|
||||
return st.Type == fuseSuperMagic
|
||||
}
|
||||
|
||||
// unmountFUSE attempts to unmount using fusermount/fusermount3 helper binary.
|
||||
//
|
||||
// For FUSE mounts, using these helper binaries is preferred, see:
|
||||
// https://github.com/containerd/containerd/pull/3765#discussion_r342083514
|
||||
func unmountFUSE(target string) error {
|
||||
var err error
|
||||
for _, helperBinary := range []string{"fusermount3", "fusermount"} {
|
||||
cmd := exec.Command(helperBinary, "-u", target)
|
||||
err = cmd.Run()
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func unmount(target string, flags int) error {
|
||||
if isFUSE(target) {
|
||||
if err := unmountFUSE(target); err == nil {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
for i := 0; i < 50; i++ {
|
||||
if err := unix.Unmount(target, flags); err != nil {
|
||||
switch err {
|
||||
case unix.EBUSY:
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
continue
|
||||
default:
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("failed to unmount target %s: %w", target, unix.EBUSY)
|
||||
}
|
||||
|
||||
// UnmountAll repeatedly unmounts the given mount point until there
|
||||
// are no mounts remaining (EINVAL is returned by mount), which is
|
||||
// useful for undoing a stack of mounts on the same mount point.
|
||||
// UnmountAll all is noop when the first argument is an empty string.
|
||||
// This is done when the containerd client did not specify any rootfs
|
||||
// mounts (e.g. because the rootfs is managed outside containerd)
|
||||
// UnmountAll is noop when the mount path does not exist.
|
||||
func UnmountAll(mount string, flags int) error {
|
||||
if mount == "" {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(mount); os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
|
||||
for {
|
||||
if err := unmount(mount, flags); err != nil {
|
||||
// EINVAL is returned if the target is not a
|
||||
// mount point, indicating that we are
|
||||
// done. It can also indicate a few other
|
||||
// things (such as invalid flags) which we
|
||||
// unfortunately end up squelching here too.
|
||||
if err == unix.EINVAL {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseMountOptions takes fstab style mount options and parses them for
|
||||
// use with a standard mount() syscall
|
||||
func parseMountOptions(options []string) (opt mountOpt) {
|
||||
loopOpt := "loop"
|
||||
flagsMap := map[string]struct {
|
||||
clear bool
|
||||
flag int
|
||||
}{
|
||||
"async": {true, unix.MS_SYNCHRONOUS},
|
||||
"atime": {true, unix.MS_NOATIME},
|
||||
"bind": {false, unix.MS_BIND},
|
||||
"defaults": {false, 0},
|
||||
"dev": {true, unix.MS_NODEV},
|
||||
"diratime": {true, unix.MS_NODIRATIME},
|
||||
"dirsync": {false, unix.MS_DIRSYNC},
|
||||
"exec": {true, unix.MS_NOEXEC},
|
||||
"mand": {false, unix.MS_MANDLOCK},
|
||||
"noatime": {false, unix.MS_NOATIME},
|
||||
"nodev": {false, unix.MS_NODEV},
|
||||
"nodiratime": {false, unix.MS_NODIRATIME},
|
||||
"noexec": {false, unix.MS_NOEXEC},
|
||||
"nomand": {true, unix.MS_MANDLOCK},
|
||||
"norelatime": {true, unix.MS_RELATIME},
|
||||
"nostrictatime": {true, unix.MS_STRICTATIME},
|
||||
"nosuid": {false, unix.MS_NOSUID},
|
||||
"rbind": {false, unix.MS_BIND | unix.MS_REC},
|
||||
"relatime": {false, unix.MS_RELATIME},
|
||||
"remount": {false, unix.MS_REMOUNT},
|
||||
"ro": {false, unix.MS_RDONLY},
|
||||
"rw": {true, unix.MS_RDONLY},
|
||||
"strictatime": {false, unix.MS_STRICTATIME},
|
||||
"suid": {true, unix.MS_NOSUID},
|
||||
"sync": {false, unix.MS_SYNCHRONOUS},
|
||||
}
|
||||
for _, o := range options {
|
||||
// If the option does not exist in the flags table or the flag
|
||||
// is not supported on the platform,
|
||||
// then it is a data value for a specific fs type
|
||||
if f, exists := flagsMap[o]; exists && f.flag != 0 {
|
||||
if f.clear {
|
||||
opt.flags &^= f.flag
|
||||
} else {
|
||||
opt.flags |= f.flag
|
||||
}
|
||||
} else if o == loopOpt {
|
||||
opt.losetup = true
|
||||
} else if strings.HasPrefix(o, "uidmap=") {
|
||||
opt.uidmap = strings.TrimPrefix(o, "uidmap=")
|
||||
} else if strings.HasPrefix(o, "gidmap=") {
|
||||
opt.gidmap = strings.TrimPrefix(o, "gidmap=")
|
||||
} else {
|
||||
opt.data = append(opt.data, o)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func hasDirectIO(opts []string) (bool, []string) {
|
||||
for idx, opt := range opts {
|
||||
if opt == "direct-io" {
|
||||
return true, append(opts[:idx], opts[idx+1:]...)
|
||||
}
|
||||
}
|
||||
return false, opts
|
||||
}
|
||||
|
||||
// compactLowerdirOption updates overlay lowdir option and returns the common
|
||||
// dir among all the lowdirs.
|
||||
func compactLowerdirOption(opts []string) (string, []string) {
|
||||
idx, dirs := findOverlayLowerdirs(opts)
|
||||
if idx == -1 || len(dirs) == 1 {
|
||||
// no need to compact if there is only one lowerdir
|
||||
return "", opts
|
||||
}
|
||||
|
||||
// find out common dir
|
||||
commondir := longestCommonPrefix(dirs)
|
||||
if commondir == "" {
|
||||
return "", opts
|
||||
}
|
||||
|
||||
// NOTE: the snapshot id is based on digits.
|
||||
// in order to avoid to get snapshots/x, should be back to parent dir.
|
||||
// however, there is assumption that the common dir is ${root}/io.containerd.v1.overlayfs/snapshots.
|
||||
commondir = path.Dir(commondir)
|
||||
if commondir == "/" || commondir == "." {
|
||||
return "", opts
|
||||
}
|
||||
commondir = commondir + "/"
|
||||
|
||||
newdirs := make([]string, 0, len(dirs))
|
||||
for _, dir := range dirs {
|
||||
if len(dir) <= len(commondir) {
|
||||
return "", opts
|
||||
}
|
||||
newdirs = append(newdirs, dir[len(commondir):])
|
||||
}
|
||||
|
||||
newopts := copyOptions(opts)
|
||||
newopts = append(newopts[:idx], newopts[idx+1:]...)
|
||||
newopts = append(newopts, fmt.Sprintf("lowerdir=%s", strings.Join(newdirs, ":")))
|
||||
return commondir, newopts
|
||||
}
|
||||
|
||||
// findOverlayLowerdirs returns the index of lowerdir in mount's options and
|
||||
// all the lowerdir target.
|
||||
func findOverlayLowerdirs(opts []string) (int, []string) {
|
||||
var (
|
||||
idx = -1
|
||||
prefix = "lowerdir="
|
||||
)
|
||||
|
||||
for i, opt := range opts {
|
||||
if strings.HasPrefix(opt, prefix) {
|
||||
idx = i
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if idx == -1 {
|
||||
return -1, nil
|
||||
}
|
||||
return idx, strings.Split(opts[idx][len(prefix):], ":")
|
||||
}
|
||||
|
||||
// longestCommonPrefix finds the longest common prefix in the string slice.
|
||||
func longestCommonPrefix(strs []string) string {
|
||||
if len(strs) == 0 {
|
||||
return ""
|
||||
} else if len(strs) == 1 {
|
||||
return strs[0]
|
||||
}
|
||||
|
||||
// find out the min/max value by alphabetical order
|
||||
min, max := strs[0], strs[0]
|
||||
for _, str := range strs[1:] {
|
||||
if min > str {
|
||||
min = str
|
||||
}
|
||||
if max < str {
|
||||
max = str
|
||||
}
|
||||
}
|
||||
|
||||
// find out the common part between min and max
|
||||
for i := 0; i < len(min) && i < len(max); i++ {
|
||||
if min[i] != max[i] {
|
||||
return min[:i]
|
||||
}
|
||||
}
|
||||
return min
|
||||
}
|
||||
|
||||
// copyOptions copies the options.
|
||||
func copyOptions(opts []string) []string {
|
||||
if len(opts) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
acopy := make([]string, len(opts))
|
||||
copy(acopy, opts)
|
||||
return acopy
|
||||
}
|
||||
|
||||
// optionsSize returns the byte size of options of mount.
|
||||
func optionsSize(opts []string) int {
|
||||
size := 0
|
||||
for _, opt := range opts {
|
||||
size += len(opt)
|
||||
}
|
||||
return size
|
||||
}
|
||||
|
||||
func mountAt(chdir string, source, target, fstype string, flags uintptr, data string) error {
|
||||
if chdir == "" {
|
||||
return unix.Mount(source, target, fstype, flags, data)
|
||||
}
|
||||
|
||||
ch := make(chan error, 1)
|
||||
go func() {
|
||||
runtime.LockOSThread()
|
||||
|
||||
// Do not unlock this thread.
|
||||
// If the thread is unlocked go will try to use it for other goroutines.
|
||||
// However it is not possible to restore the thread state after CLONE_FS.
|
||||
//
|
||||
// Once the goroutine exits the thread should eventually be terminated by go.
|
||||
|
||||
if err := unix.Unshare(unix.CLONE_FS); err != nil {
|
||||
ch <- err
|
||||
return
|
||||
}
|
||||
|
||||
if err := unix.Chdir(chdir); err != nil {
|
||||
ch <- err
|
||||
return
|
||||
}
|
||||
|
||||
ch <- unix.Mount(source, target, fstype, flags, data)
|
||||
}()
|
||||
return <-ch
|
||||
}
|
||||
|
||||
func (m *Mount) mountWithHelper(helperBinary, typePrefix, target string) error {
|
||||
// helperBinary: "mount.fuse3"
|
||||
// target: "/foo/merged"
|
||||
// m.Type: "fuse3.fuse-overlayfs"
|
||||
// command: "mount.fuse3 overlay /foo/merged -o lowerdir=/foo/lower2:/foo/lower1,upperdir=/foo/upper,workdir=/foo/work -t fuse-overlayfs"
|
||||
args := []string{m.Source, target}
|
||||
for _, o := range m.Options {
|
||||
args = append(args, "-o", o)
|
||||
}
|
||||
args = append(args, "-t", strings.TrimPrefix(m.Type, typePrefix))
|
||||
|
||||
infoBeforeMount, err := Lookup(target)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// cmd.CombinedOutput() may intermittently return ECHILD because of our signal handling in shim.
|
||||
// See #4387 and wait(2).
|
||||
const retriesOnECHILD = 10
|
||||
for i := 0; i < retriesOnECHILD; i++ {
|
||||
cmd := exec.Command(helperBinary, args...)
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
if !errors.Is(err, unix.ECHILD) {
|
||||
return fmt.Errorf("mount helper [%s %v] failed: %q: %w", helperBinary, args, string(out), err)
|
||||
}
|
||||
// We got ECHILD, we are not sure whether the mount was successful.
|
||||
// If the mount ID has changed, we are sure we got some new mount, but still not sure it is fully completed.
|
||||
// So we attempt to unmount the new mount before retrying.
|
||||
infoAfterMount, err := Lookup(target)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if infoAfterMount.ID != infoBeforeMount.ID {
|
||||
_ = unmount(target, 0)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("mount helper [%s %v] failed with ECHILD (retried %d times)", helperBinary, args, retriesOnECHILD)
|
||||
}
|
||||
Reference in New Issue
Block a user