From 67b54c667056935b2cdc0e5a9e9d866c69342741 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Thu, 26 Jul 2018 15:21:44 +0800 Subject: [PATCH] Support >= 128 layers in overlayfs snapshots Auto-detect the longest common dir in the lowerdir option and compact the option if its size is about to hit one page. If it is, use chdir + CLONE to do the mount, to avoid overflowing the one-page argument buffer of the Linux kernel mount syscall. Signed-off-by: Wei Fu --- mount/mount_linux.go | 155 +++++++++++++++++++++++++++- mount/mount_linux_test.go | 94 +++++++++++++++++ snapshots/testsuite/testsuite.go | 93 +++++++++++++++++ sys/mount_linux.go | 119 ++++++++++++++++++++++ sys/mount_linux_test.go | 169 +++++++++++++++++++++++++++++++ sys/subprocess_unsafe_linux.go | 30 ++++++ sys/subprocess_unsafe_linux.s | 15 +++ 7 files changed, 673 insertions(+), 2 deletions(-) create mode 100644 mount/mount_linux_test.go create mode 100644 sys/mount_linux.go create mode 100644 sys/mount_linux_test.go create mode 100644 sys/subprocess_unsafe_linux.go create mode 100644 sys/subprocess_unsafe_linux.s diff --git a/mount/mount_linux.go b/mount/mount_linux.go index 82fc0b279..b5a16148a 100644 --- a/mount/mount_linux.go +++ b/mount/mount_linux.go @@ -17,16 +17,41 @@ package mount import ( + "fmt" + "os" + "path" "strings" "time" + "github.com/containerd/containerd/sys" "github.com/pkg/errors" "golang.org/x/sys/unix" ) +var pagesize = 4096 + +func init() { + pagesize = os.Getpagesize() +} + // Mount to the provided target path func (m *Mount) Mount(target string) error { - flags, data := parseMountOptions(m.Options) + var ( + chdir string + options = m.Options + ) + + // avoid hitting the one-page limit of the mount argument buffer + // + // NOTE: 512 bytes are reserved as headroom in the pagesize check. 
+ if m.Type == "overlay" && optionsSize(options) >= pagesize-512 { + chdir, options = compactLowerdirOption(options) + } + + flags, data := parseMountOptions(options) + if len(data) > pagesize { + return errors.Errorf("mount options is too long") + } // propagation types. const ptypes = unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE @@ -38,7 +63,7 @@ func (m *Mount) Mount(target string) error { if flags&unix.MS_REMOUNT == 0 || data != "" { // Initial call applying all non-propagation flags for mount // or remount with changed data - if err := unix.Mount(m.Source, target, m.Type, uintptr(oflags), data); err != nil { + if err := mountAt(chdir, m.Source, target, m.Type, uintptr(oflags), data); err != nil { return err } } @@ -155,3 +180,129 @@ func parseMountOptions(options []string) (int, string) { } return flag, strings.Join(data, ",") } + +// compactLowerdirOption updates the overlay lowerdir option and returns the common +// dir among all the lowerdirs. +func compactLowerdirOption(opts []string) (string, []string) { + idx, dirs := findOverlayLowerdirs(opts) + if idx == -1 || len(dirs) == 1 { + // no need to compact if there is only one lowerdir + return "", opts + } + + // find out common dir + commondir := longestCommonPrefix(dirs) + if commondir == "" { + return "", opts + } + + // NOTE: the snapshot id is based on digits. + // to avoid a common prefix that ends inside an id (snapshots/x), step back to the parent dir. + // however, this assumes that the common dir is ${root}/io.containerd.v1.overlayfs/snapshots. + commondir = path.Dir(commondir) + if commondir == "/" { + return "", opts + } + commondir = commondir + "/" + + newdirs := make([]string, 0, len(dirs)) + for _, dir := range dirs { + newdirs = append(newdirs, dir[len(commondir):]) + } + + newopts := copyOptions(opts) + newopts = append(newopts[:idx], newopts[idx+1:]...) 
+ newopts = append(newopts, fmt.Sprintf("lowerdir=%s", strings.Join(newdirs, ":"))) + return commondir, newopts +} + +// findOverlayLowerdirs returns the index of lowerdir in mount's options and +// all the lowerdir target. +func findOverlayLowerdirs(opts []string) (int, []string) { + var ( + idx = -1 + prefix = "lowerdir=" + ) + + for i, opt := range opts { + if strings.HasPrefix(opt, prefix) { + idx = i + break + } + } + + if idx == -1 { + return -1, nil + } + return idx, strings.Split(opts[idx][len(prefix):], ":") +} + +// longestCommonPrefix finds the longest common prefix in the string slice. +func longestCommonPrefix(strs []string) string { + if len(strs) == 0 { + return "" + } else if len(strs) == 1 { + return strs[0] + } + + // find out the min/max value by alphabetical order + min, max := strs[0], strs[0] + for _, str := range strs[1:] { + if min > str { + min = str + } + if max < str { + max = str + } + } + + // find out the common part between min and max + for i := 0; i < len(min) && i < len(max); i++ { + if min[i] != max[i] { + return min[:i] + } + } + return min +} + +// copyOptions copies the options. +func copyOptions(opts []string) []string { + if len(opts) == 0 { + return nil + } + + acopy := make([]string, len(opts)) + copy(acopy, opts) + return acopy +} + +// optionsSize returns the byte size of options of mount. 
+func optionsSize(opts []string) int { + size := 0 + for _, opt := range opts { + size += len(opt) + } + return size +} + +func mountAt(chdir string, source, target, fstype string, flags uintptr, data string) error { + if chdir == "" { + return unix.Mount(source, target, fstype, flags, data) + } + + f, err := os.Open(chdir) + if err != nil { + return errors.Wrap(err, "failed to mountat") + } + defer f.Close() + + fs, err := f.Stat() + if err != nil { + return errors.Wrap(err, "failed to mountat") + } + + if !fs.IsDir() { + return errors.Wrap(errors.Errorf("%s is not dir", chdir), "failed to mountat") + } + return errors.Wrap(sys.FMountat(f.Fd(), source, target, fstype, flags, data), "failed to mountat") +} diff --git a/mount/mount_linux_test.go b/mount/mount_linux_test.go new file mode 100644 index 000000000..c3e6a018d --- /dev/null +++ b/mount/mount_linux_test.go @@ -0,0 +1,94 @@ +// +build linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package mount + +import ( + "reflect" + "testing" +) + +func TestLongestCommonPrefix(t *testing.T) { + tcases := []struct { + in []string + expected string + }{ + {[]string{}, ""}, + {[]string{"foo"}, "foo"}, + {[]string{"foo", "bar"}, ""}, + {[]string{"foo", "foo"}, "foo"}, + {[]string{"foo", "foobar"}, "foo"}, + {[]string{"foo", "", "foobar"}, ""}, + } + + for i, tc := range tcases { + if got := longestCommonPrefix(tc.in); got != tc.expected { + t.Fatalf("[%d case] expected (%s), but got (%s)", i+1, tc.expected, got) + } + } +} + +func TestCompactLowerdirOption(t *testing.T) { + tcases := []struct { + opts []string + commondir string + newopts []string + }{ + // no lowerdir or only one + { + []string{"workdir=a"}, + "", + []string{"workdir=a"}, + }, + { + []string{"workdir=a", "lowerdir=b"}, + "", + []string{"workdir=a", "lowerdir=b"}, + }, + + // >= 2 lowerdir + { + []string{"lowerdir=/snapshots/1/fs:/snapshots/10/fs"}, + "/snapshots/", + []string{"lowerdir=1/fs:10/fs"}, + }, + { + []string{"lowerdir=/snapshots/1/fs:/snapshots/10/fs:/snapshots/2/fs"}, + "/snapshots/", + []string{"lowerdir=1/fs:10/fs:2/fs"}, + }, + + // if common dir is / + { + []string{"lowerdir=/snapshots/1/fs:/other_snapshots/1/fs"}, + "", + []string{"lowerdir=/snapshots/1/fs:/other_snapshots/1/fs"}, + }, + } + + for i, tc := range tcases { + dir, opts := compactLowerdirOption(tc.opts) + if dir != tc.commondir { + t.Fatalf("[%d case] expected common dir (%s), but got (%s)", i+1, tc.commondir, dir) + } + + if !reflect.DeepEqual(opts, tc.newopts) { + t.Fatalf("[%d case] expected options (%v), but got (%v)", i+1, tc.newopts, opts) + } + } +} diff --git a/snapshots/testsuite/testsuite.go b/snapshots/testsuite/testsuite.go index 851cebf60..49e7904dc 100644 --- a/snapshots/testsuite/testsuite.go +++ b/snapshots/testsuite/testsuite.go @@ -63,6 +63,8 @@ func SnapshotterSuite(t *testing.T, name string, snapshotterFn func(ctx context. 
t.Run("StatInWalk", makeTest(name, snapshotterFn, checkStatInWalk)) t.Run("CloseTwice", makeTest(name, snapshotterFn, closeTwice)) t.Run("RootPermission", makeTest(name, snapshotterFn, checkRootPermission)) + + t.Run("128LayersMount", makeTest(name, snapshotterFn, check128LayersMount)) } func makeTest(name string, snapshotterFn func(ctx context.Context, root string) (snapshots.Snapshotter, func() error, error), fn func(ctx context.Context, t *testing.T, snapshotter snapshots.Snapshotter, work string)) func(t *testing.T) { @@ -860,3 +862,94 @@ func checkRootPermission(ctx context.Context, t *testing.T, snapshotter snapshot t.Fatalf("expected 0755, got 0%o", mode) } } + +func check128LayersMount(ctx context.Context, t *testing.T, snapshotter snapshots.Snapshotter, work string) { + lowestApply := fstest.Apply( + fstest.CreateFile("/bottom", []byte("way at the bottom\n"), 0777), + fstest.CreateFile("/overwriteme", []byte("FIRST!\n"), 0777), + fstest.CreateDir("/ADDHERE", 0755), + fstest.CreateDir("/ONLYME", 0755), + fstest.CreateFile("/ONLYME/bottom", []byte("bye!\n"), 0777), + ) + + appliers := []fstest.Applier{lowestApply} + for i := 1; i <= 127; i++ { + appliers = append(appliers, fstest.Apply( + fstest.CreateFile("/overwriteme", []byte(fmt.Sprintf("%d WAS HERE!\n", i)), 0777), + fstest.CreateFile(fmt.Sprintf("/ADDHERE/file-%d", i), []byte("same\n"), 0755), + fstest.RemoveAll("/ONLYME"), + fstest.CreateDir("/ONLYME", 0755), + fstest.CreateFile(fmt.Sprintf("/ONLYME/file-%d", i), []byte("only me!\n"), 0777), + )) + } + + flat := filepath.Join(work, "flat") + if err := os.MkdirAll(flat, 0777); err != nil { + t.Fatalf("failed to create flat dir(%s): %+v", flat, err) + } + + // NOTE: add gc labels to avoid snapshots get removed by gc... 
+ parent := "" + for i, applier := range appliers { + preparing := filepath.Join(work, fmt.Sprintf("prepare-layer-%d", i)) + if err := os.MkdirAll(preparing, 0777); err != nil { + t.Fatalf("[layer %d] failed to create preparing dir(%s): %+v", i, preparing, err) + } + + mounts, err := snapshotter.Prepare(ctx, preparing, parent, opt) + if err != nil { + t.Fatalf("[layer %d] failed to get mount info: %+v", i, err) + } + + if err := mount.All(mounts, preparing); err != nil { + t.Fatalf("[layer %d] failed to mount on the target(%s): %+v", i, preparing, err) + } + + if err := fstest.CheckDirectoryEqual(preparing, flat); err != nil { + testutil.Unmount(t, preparing) + t.Fatalf("[layer %d] preparing doesn't equal to flat before apply: %+v", i, err) + } + + if err := applier.Apply(flat); err != nil { + testutil.Unmount(t, preparing) + t.Fatalf("[layer %d] failed to apply on flat dir: %+v", i, err) + } + + if err = applier.Apply(preparing); err != nil { + testutil.Unmount(t, preparing) + t.Fatalf("[layer %d] failed to apply on preparing dir: %+v", i, err) + } + + if err := fstest.CheckDirectoryEqual(preparing, flat); err != nil { + testutil.Unmount(t, preparing) + t.Fatalf("[layer %d] preparing doesn't equal to flat after apply: %+v", i, err) + } + + testutil.Unmount(t, preparing) + + parent = filepath.Join(work, fmt.Sprintf("committed-%d", i)) + if err := snapshotter.Commit(ctx, parent, preparing, opt); err != nil { + t.Fatalf("[layer %d] failed to commit the preparing: %+v", i, err) + } + + } + + view := filepath.Join(work, "fullview") + if err := os.MkdirAll(view, 0777); err != nil { + t.Fatalf("failed to create fullview dir(%s): %+v", view, err) + } + + mounts, err := snapshotter.View(ctx, view, parent, opt) + if err != nil { + t.Fatalf("failed to get view's mount info: %+v", err) + } + + if err := mount.All(mounts, view); err != nil { + t.Fatalf("failed to mount on the target(%s): %+v", view, err) + } + defer testutil.Unmount(t, view) + + if err := 
fstest.CheckDirectoryEqual(view, flat); err != nil { + t.Fatalf("fullview should equal to flat: %+v", err) + } +} diff --git a/sys/mount_linux.go b/sys/mount_linux.go new file mode 100644 index 000000000..a9eee9b73 --- /dev/null +++ b/sys/mount_linux.go @@ -0,0 +1,119 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sys + +import ( + "runtime" + "syscall" + "unsafe" + + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// FMountat performs mount from the provided directory. 
+func FMountat(dirfd uintptr, source, target, fstype string, flags uintptr, data string) error { + var ( + sourceP, targetP, fstypeP, dataP *byte + pid uintptr + ws unix.WaitStatus + err error + errno syscall.Errno + ) + + sourceP, err = syscall.BytePtrFromString(source) + if err != nil { + return err + } + + targetP, err = syscall.BytePtrFromString(target) + if err != nil { + return err + } + + fstypeP, err = syscall.BytePtrFromString(fstype) + if err != nil { + return err + } + + if data != "" { + dataP, err = syscall.BytePtrFromString(data) + if err != nil { + return err + } + } + + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + pid, errno = forkAndMountat(dirfd, + uintptr(unsafe.Pointer(sourceP)), + uintptr(unsafe.Pointer(targetP)), + uintptr(unsafe.Pointer(fstypeP)), + flags, + uintptr(unsafe.Pointer(dataP))) + + if errno != 0 { + return errors.Wrap(errno, "failed to fork thread") + } + + _, err = unix.Wait4(int(pid), &ws, 0, nil) + for err == syscall.EINTR { + _, err = unix.Wait4(int(pid), &ws, 0, nil) + } + + if err != nil { + return errors.Wrapf(err, "failed to find pid=%d process", pid) + } + + errno = syscall.Errno(ws.ExitStatus()) + if errno != 0 { + return errors.Wrap(errno, "failed to mount") + } + return nil +} + +// forkAndMountat forks a thread, changes its working dir and performs the mount. +// +// precondition: the runtime OS thread must be locked. +func forkAndMountat(dirfd uintptr, source, target, fstype, flags, data uintptr) (pid uintptr, errno syscall.Errno) { + // block signals during clone + beforeFork() + + // the cloned thread shares the open file descriptors, but the thread + // is never reused by the runtime. 
+ pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|syscall.CLONE_FILES, 0, 0, 0, 0, 0) + if errno != 0 || pid != 0 { + // restore all signals + afterFork() + return + } + + // restore all signals + afterForkInChild() + + // change working dir + _, _, errno = syscall.RawSyscall(syscall.SYS_FCHDIR, dirfd, 0, 0) + if errno != 0 { + goto childerr + } + _, _, errno = syscall.RawSyscall6(syscall.SYS_MOUNT, source, target, fstype, flags, data, 0) + +childerr: + syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0) + panic("unreachable") +} diff --git a/sys/mount_linux_test.go b/sys/mount_linux_test.go new file mode 100644 index 000000000..b548615a6 --- /dev/null +++ b/sys/mount_linux_test.go @@ -0,0 +1,169 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package sys + +import ( + "io/ioutil" + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/containerd/continuity/fs/fstest" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +type fMountatCaseFunc func(t *testing.T, root string) + +func TestFMountat(t *testing.T) { + if RunningUnprivileged() { + t.Skip("Needs to be run as root") + return + } + + t.Run("Normal", makeTestForFMountat(testFMountatNormal)) + t.Run("ChdirWithFileFd", makeTestForFMountat(testFMountatWithFileFd)) + t.Run("MountWithInvalidSource", makeTestForFMountat(testFMountatWithInvalidSource)) +} + +func makeTestForFMountat(fn fMountatCaseFunc) func(t *testing.T) { + return func(t *testing.T) { + t.Parallel() + + suiteDir, err := ioutil.TempDir("", "fmountat-test-") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(suiteDir) + + fn(t, suiteDir) + } +} + +func testFMountatNormal(t *testing.T, root string) { + expectedContent := "bye re-exec!\n" + apply := fstest.Apply( + fstest.CreateFile("/hi", []byte(expectedContent), 0777), + ) + + workdir := filepath.Join(root, "work") + if err := os.MkdirAll(workdir, 0777); err != nil { + t.Fatalf("failed to create dir(%s): %+v", workdir, err) + } + + if err := apply.Apply(workdir); err != nil { + t.Fatalf("failed to prepare source dir: %+v", err) + } + + atdir := filepath.Join(root, "at") + if err := os.MkdirAll(atdir, 0777); err != nil { + t.Fatalf("failed to create working dir(%s): %+v", atdir, err) + } + + fsdir := filepath.Join(atdir, "fs") + if err := os.MkdirAll(fsdir, 0777); err != nil { + t.Fatalf("failed to create mount point dir(%s): %+v", fsdir, err) + } + + f, err := os.Open(atdir) + if err != nil { + t.Fatalf("failed to open dir(%s): %+v", atdir, err) + } + defer f.Close() + + // mount work to fs + if err = FMountat(f.Fd(), workdir, "fs", "bind", unix.MS_BIND|unix.MS_RDONLY, ""); err != nil { + t.Fatalf("expected no error here, but got error: %+v", err) + } + defer umount(t, fsdir) + + // check hi file + 
content, err := ioutil.ReadFile(filepath.Join(fsdir, "hi")) + if err != nil { + t.Fatalf("failed to read file: %+v", err) + } + if got := string(content); got != expectedContent { + t.Fatalf("expected to get(%v), but got(%v)", expectedContent, got) + } + + // check the working directory + cwd, err := os.Getwd() + if err != nil { + t.Fatalf("failed to get current working dir: %+v", err) + } + + if cwd == atdir { + t.Fatal("should not change the current working directory") + } +} + +func testFMountatWithFileFd(t *testing.T, root string) { + // not a directory + expectedErr := syscall.Errno(20) + + emptyFile := filepath.Join(root, "emptyFile") + f, err := os.Create(emptyFile) + if err != nil { + t.Fatalf("failed to create file(%s): %+v", emptyFile, err) + } + defer f.Close() + + err = FMountat(f.Fd(), filepath.Join(root, "empty"), filepath.Join(root, "work"), "", 0, "") + if got := errors.Cause(err); got != expectedErr { + t.Fatalf("expected error %v, but got %v", expectedErr, got) + } +} + +func testFMountatWithInvalidSource(t *testing.T, root string) { + // no such file or directory + expectedErr := syscall.Errno(2) + + atdir := filepath.Join(root, "at") + if err := os.MkdirAll(atdir, 0777); err != nil { + t.Fatalf("failed to create dir(%s): %+v", atdir, err) + } + + f, err := os.Open(root) + if err != nil { + t.Fatalf("failed to open dir(%s): %+v", atdir, err) + } + defer f.Close() + + err = FMountat(f.Fd(), filepath.Join(root, "oops"), "at", "bind", unix.MS_BIND, "") + if got := errors.Cause(err); got != expectedErr { + t.Fatalf("expected error %v, but got %v", expectedErr, got) + } +} + +func umount(t *testing.T, target string) { + for i := 0; i < 50; i++ { + if err := unix.Unmount(target, unix.MNT_DETACH); err != nil { + switch err { + case unix.EBUSY: + time.Sleep(50 * time.Millisecond) + continue + case unix.EINVAL: + return + default: + continue + } + } + } + t.Fatalf("failed to unmount target %s", target) +} diff --git a/sys/subprocess_unsafe_linux.go 
b/sys/subprocess_unsafe_linux.go new file mode 100644 index 000000000..6e40a9c7d --- /dev/null +++ b/sys/subprocess_unsafe_linux.go @@ -0,0 +1,30 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sys + +import ( + _ "unsafe" // required for go:linkname. +) + +//go:linkname beforeFork syscall.runtime_BeforeFork +func beforeFork() + +//go:linkname afterFork syscall.runtime_AfterFork +func afterFork() + +//go:linkname afterForkInChild syscall.runtime_AfterForkInChild +func afterForkInChild() diff --git a/sys/subprocess_unsafe_linux.s b/sys/subprocess_unsafe_linux.s new file mode 100644 index 000000000..c073fa4ad --- /dev/null +++ b/sys/subprocess_unsafe_linux.s @@ -0,0 +1,15 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/