Support >= 128 layers in overlayfs snapshots

Auto-detect the longest common directory among the lowerdir entries and
compact the option when its size approaches one page. In that case, use
clone + fchdir to perform the mount from the common directory with relative
lowerdir paths, which avoids overflowing the one-page argument buffer of the
Linux kernel's mount syscall.

Signed-off-by: Wei Fu <fhfuwei@163.com>
This commit is contained in:
Wei Fu 2018-07-26 15:21:44 +08:00
parent 26e2dd6754
commit 67b54c6670
7 changed files with 673 additions and 2 deletions

View File

@ -17,16 +17,41 @@
package mount
import (
"fmt"
"os"
"path"
"strings"
"time"
"github.com/containerd/containerd/sys"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// pagesize is the memory page size reported by the OS, in bytes. Mount
// compares the size of the mount options against it.
var pagesize = os.Getpagesize()
// Mount to the provided target path
func (m *Mount) Mount(target string) error {
flags, data := parseMountOptions(m.Options)
var (
chdir string
options = m.Options
)
// avoid hitting one page limit of mount argument buffer
//
// NOTE: 512 is a buffer during pagesize check.
if m.Type == "overlay" && optionsSize(options) >= pagesize-512 {
chdir, options = compactLowerdirOption(options)
}
flags, data := parseMountOptions(options)
if len(data) > pagesize {
return errors.Errorf("mount options is too long")
}
// propagation types.
const ptypes = unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE
@ -38,7 +63,7 @@ func (m *Mount) Mount(target string) error {
if flags&unix.MS_REMOUNT == 0 || data != "" {
// Initial call applying all non-propagation flags for mount
// or remount with changed data
if err := unix.Mount(m.Source, target, m.Type, uintptr(oflags), data); err != nil {
if err := mountAt(chdir, m.Source, target, m.Type, uintptr(oflags), data); err != nil {
return err
}
}
@ -155,3 +180,129 @@ func parseMountOptions(options []string) (int, string) {
}
return flag, strings.Join(data, ",")
}
// compactLowerdirOption rewrites the overlay lowerdir option so that its
// entries are relative to a shared parent directory.
//
// It returns that parent directory (always ending in "/") together with a new
// options slice in which the original lowerdir option has been removed and
// the compacted one appended at the end. The input slice is not modified.
//
// When compaction is not possible it returns "" and opts unchanged. That is
// the case when there is no lowerdir option, only one lowerdir, no usable
// common parent directory, or an entry that cannot be expressed relative to
// the common parent.
func compactLowerdirOption(opts []string) (string, []string) {
	idx, dirs := findOverlayLowerdirs(opts)
	if idx == -1 || len(dirs) == 1 {
		// no need to compact if there is only one lowerdir
		return "", opts
	}

	// find out common dir
	commondir := longestCommonPrefix(dirs)
	if commondir == "" {
		return "", opts
	}

	// NOTE: the snapshot id is based on digits.
	// In order to avoid ending up with snapshots/x as the common dir, go back
	// to the parent dir. There is an assumption that the common dir is
	// ${root}/io.containerd.v1.overlayfs/snapshots.
	commondir = path.Dir(commondir)
	// path.Dir yields "/" when only the root is shared and "." when the
	// common prefix contains no slash at all (relative lowerdirs). Neither is
	// a prefix worth stripping, and "." is not even a textual prefix of the
	// entries.
	if commondir == "/" || commondir == "." {
		return "", opts
	}
	commondir = commondir + "/"

	newdirs := make([]string, 0, len(dirs))
	for _, dir := range dirs {
		// path.Dir cleans its argument, so for non-normalized entries the
		// result may not be a literal prefix; also refuse to produce an
		// empty lowerdir entry. Fall back to the original options instead of
		// slicing at a wrong offset.
		if !strings.HasPrefix(dir, commondir) || len(dir) == len(commondir) {
			return "", opts
		}
		newdirs = append(newdirs, dir[len(commondir):])
	}

	newopts := copyOptions(opts)
	newopts = append(newopts[:idx], newopts[idx+1:]...)
	newopts = append(newopts, fmt.Sprintf("lowerdir=%s", strings.Join(newdirs, ":")))
	return commondir, newopts
}
// findOverlayLowerdirs locates the first "lowerdir=" entry in opts and
// returns its position together with the colon-separated directories it
// lists. It returns (-1, nil) when no such entry exists.
func findOverlayLowerdirs(opts []string) (int, []string) {
	const key = "lowerdir="

	for pos, o := range opts {
		if !strings.HasPrefix(o, key) {
			continue
		}
		// only the first lowerdir option is considered
		return pos, strings.Split(o[len(key):], ":")
	}
	return -1, nil
}
// longestCommonPrefix returns the longest byte prefix shared by every string
// in strs. An empty slice yields "" and a single-element slice yields that
// element.
func longestCommonPrefix(strs []string) string {
	switch len(strs) {
	case 0:
		return ""
	case 1:
		return strs[0]
	}

	// The prefix common to the lexicographically smallest and largest
	// strings is common to everything in between, so only those two need to
	// be compared.
	lo, hi := strs[0], strs[0]
	for _, s := range strs[1:] {
		if s < lo {
			lo = s
		}
		if s > hi {
			hi = s
		}
	}

	limit := len(lo)
	if len(hi) < limit {
		limit = len(hi)
	}
	for i := 0; i < limit; i++ {
		if lo[i] != hi[i] {
			return lo[:i]
		}
	}
	// no mismatch found: the smallest string is itself the common prefix
	return lo
}
// copyOptions returns a copy of opts backed by its own array, or nil when
// opts is empty.
func copyOptions(opts []string) []string {
	n := len(opts)
	if n == 0 {
		return nil
	}
	return append(make([]string, 0, n), opts...)
}
// optionsSize returns the total length in bytes of all entries in opts.
// Separators between entries are not counted.
func optionsSize(opts []string) int {
	total := 0
	for i := range opts {
		total += len(opts[i])
	}
	return total
}
// mountAt performs the mount described by source, target, fstype, flags and
// data. With an empty chdir it calls unix.Mount directly; otherwise it opens
// chdir, checks that it is a directory and hands its file descriptor to
// sys.FMountat.
func mountAt(chdir string, source, target, fstype string, flags uintptr, data string) error {
	if chdir == "" {
		return unix.Mount(source, target, fstype, flags, data)
	}

	dir, err := os.Open(chdir)
	if err != nil {
		return errors.Wrap(err, "failed to mountat")
	}
	defer dir.Close()

	info, err := dir.Stat()
	if err != nil {
		return errors.Wrap(err, "failed to mountat")
	}
	if !info.IsDir() {
		return errors.Wrap(errors.Errorf("%s is not dir", chdir), "failed to mountat")
	}

	if err := sys.FMountat(dir.Fd(), source, target, fstype, flags, data); err != nil {
		return errors.Wrap(err, "failed to mountat")
	}
	return nil
}

94
mount/mount_linux_test.go Normal file
View File

@ -0,0 +1,94 @@
// +build linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mount
import (
"reflect"
"testing"
)
// TestLongestCommonPrefix checks longestCommonPrefix against a table of
// inputs, including empty, single-element and disjoint slices.
func TestLongestCommonPrefix(t *testing.T) {
	for i, tc := range []struct {
		in       []string
		expected string
	}{
		{in: []string{}, expected: ""},
		{in: []string{"foo"}, expected: "foo"},
		{in: []string{"foo", "bar"}, expected: ""},
		{in: []string{"foo", "foo"}, expected: "foo"},
		{in: []string{"foo", "foobar"}, expected: "foo"},
		{in: []string{"foo", "", "foobar"}, expected: ""},
	} {
		got := longestCommonPrefix(tc.in)
		if got == tc.expected {
			continue
		}
		t.Fatalf("[%d case] expected (%s), but got (%s)", i+1, tc.expected, got)
	}
}
// TestCompactLowerdirOption checks both return values of
// compactLowerdirOption: the common directory and the rewritten options.
func TestCompactLowerdirOption(t *testing.T) {
	type compactCase struct {
		opts      []string
		commondir string
		newopts   []string
	}

	table := []compactCase{
		// no lowerdir or only one
		{
			opts:      []string{"workdir=a"},
			commondir: "",
			newopts:   []string{"workdir=a"},
		},
		{
			opts:      []string{"workdir=a", "lowerdir=b"},
			commondir: "",
			newopts:   []string{"workdir=a", "lowerdir=b"},
		},
		// >= 2 lowerdir
		{
			opts:      []string{"lowerdir=/snapshots/1/fs:/snapshots/10/fs"},
			commondir: "/snapshots/",
			newopts:   []string{"lowerdir=1/fs:10/fs"},
		},
		{
			opts:      []string{"lowerdir=/snapshots/1/fs:/snapshots/10/fs:/snapshots/2/fs"},
			commondir: "/snapshots/",
			newopts:   []string{"lowerdir=1/fs:10/fs:2/fs"},
		},
		// if common dir is /
		{
			opts:      []string{"lowerdir=/snapshots/1/fs:/other_snapshots/1/fs"},
			commondir: "",
			newopts:   []string{"lowerdir=/snapshots/1/fs:/other_snapshots/1/fs"},
		},
	}

	for n, c := range table {
		gotDir, gotOpts := compactLowerdirOption(c.opts)
		if gotDir != c.commondir {
			t.Fatalf("[%d case] expected common dir (%s), but got (%s)", n+1, c.commondir, gotDir)
		}
		if !reflect.DeepEqual(gotOpts, c.newopts) {
			t.Fatalf("[%d case] expected options (%v), but got (%v)", n+1, c.newopts, gotOpts)
		}
	}
}

View File

@ -63,6 +63,8 @@ func SnapshotterSuite(t *testing.T, name string, snapshotterFn func(ctx context.
t.Run("StatInWalk", makeTest(name, snapshotterFn, checkStatInWalk))
t.Run("CloseTwice", makeTest(name, snapshotterFn, closeTwice))
t.Run("RootPermission", makeTest(name, snapshotterFn, checkRootPermission))
t.Run("128LayersMount", makeTest(name, snapshotterFn, check128LayersMount))
}
func makeTest(name string, snapshotterFn func(ctx context.Context, root string) (snapshots.Snapshotter, func() error, error), fn func(ctx context.Context, t *testing.T, snapshotter snapshots.Snapshotter, work string)) func(t *testing.T) {
@ -860,3 +862,94 @@ func checkRootPermission(ctx context.Context, t *testing.T, snapshotter snapshot
t.Fatalf("expected 0755, got 0%o", mode)
}
}
// check128LayersMount stacks 128 snapshots (one base layer plus 127 layers on
// top of it), mirroring every change into a plain "flat" directory. After each
// layer, and finally through a View of the last committed snapshot, it checks
// that the mounted snapshot content equals the flat directory.
func check128LayersMount(ctx context.Context, t *testing.T, snapshotter snapshots.Snapshotter, work string) {
// bottom layer: creates the files and directories that later layers touch
lowestApply := fstest.Apply(
fstest.CreateFile("/bottom", []byte("way at the bottom\n"), 0777),
fstest.CreateFile("/overwriteme", []byte("FIRST!\n"), 0777),
fstest.CreateDir("/ADDHERE", 0755),
fstest.CreateDir("/ONLYME", 0755),
fstest.CreateFile("/ONLYME/bottom", []byte("bye!\n"), 0777),
)
appliers := []fstest.Applier{lowestApply}
// 127 more layers: each overwrites one file, adds one file to /ADDHERE and
// recreates /ONLYME with a single layer-specific file
for i := 1; i <= 127; i++ {
appliers = append(appliers, fstest.Apply(
fstest.CreateFile("/overwriteme", []byte(fmt.Sprintf("%d WAS HERE!\n", i)), 0777),
fstest.CreateFile(fmt.Sprintf("/ADDHERE/file-%d", i), []byte("same\n"), 0755),
fstest.RemoveAll("/ONLYME"),
fstest.CreateDir("/ONLYME", 0755),
fstest.CreateFile(fmt.Sprintf("/ONLYME/file-%d", i), []byte("only me!\n"), 0777),
))
}
// flat is the reference directory that receives the same changes without
// going through the snapshotter
flat := filepath.Join(work, "flat")
if err := os.MkdirAll(flat, 0777); err != nil {
t.Fatalf("failed to create flat dir(%s): %+v", flat, err)
}
// NOTE: opt is defined outside this function and is passed to every
// Prepare/Commit/View call; presumably it carries the gc labels that keep the
// snapshots from being removed by gc — confirm against its definition.
parent := ""
for i, applier := range appliers {
preparing := filepath.Join(work, fmt.Sprintf("prepare-layer-%d", i))
if err := os.MkdirAll(preparing, 0777); err != nil {
t.Fatalf("[layer %d] failed to create preparing dir(%s): %+v", i, preparing, err)
}
mounts, err := snapshotter.Prepare(ctx, preparing, parent, opt)
if err != nil {
t.Fatalf("[layer %d] failed to get mount info: %+v", i, err)
}
if err := mount.All(mounts, preparing); err != nil {
t.Fatalf("[layer %d] failed to mount on the target(%s): %+v", i, preparing, err)
}
// from here on the snapshot is mounted, so every failure path unmounts it
// before aborting the test
if err := fstest.CheckDirectoryEqual(preparing, flat); err != nil {
testutil.Unmount(t, preparing)
t.Fatalf("[layer %d] preparing doesn't equal to flat before apply: %+v", i, err)
}
if err := applier.Apply(flat); err != nil {
testutil.Unmount(t, preparing)
t.Fatalf("[layer %d] failed to apply on flat dir: %+v", i, err)
}
if err = applier.Apply(preparing); err != nil {
testutil.Unmount(t, preparing)
t.Fatalf("[layer %d] failed to apply on preparing dir: %+v", i, err)
}
if err := fstest.CheckDirectoryEqual(preparing, flat); err != nil {
testutil.Unmount(t, preparing)
t.Fatalf("[layer %d] preparing doesn't equal to flat after apply: %+v", i, err)
}
testutil.Unmount(t, preparing)
// the committed snapshot becomes the parent of the next layer
parent = filepath.Join(work, fmt.Sprintf("committed-%d", i))
if err := snapshotter.Commit(ctx, parent, preparing, opt); err != nil {
t.Fatalf("[layer %d] failed to commit the preparing: %+v", i, err)
}
}
// finally mount a view on top of the last committed snapshot and compare it
// with the flat directory
view := filepath.Join(work, "fullview")
if err := os.MkdirAll(view, 0777); err != nil {
t.Fatalf("failed to create fullview dir(%s): %+v", view, err)
}
mounts, err := snapshotter.View(ctx, view, parent, opt)
if err != nil {
t.Fatalf("failed to get view's mount info: %+v", err)
}
if err := mount.All(mounts, view); err != nil {
t.Fatalf("failed to mount on the target(%s): %+v", view, err)
}
defer testutil.Unmount(t, view)
if err := fstest.CheckDirectoryEqual(view, flat); err != nil {
t.Fatalf("fullview should equal to flat: %+v", err)
}
}

119
sys/mount_linux.go Normal file
View File

@ -0,0 +1,119 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"runtime"
"syscall"
"unsafe"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// FMountat performs mount from the provided directory.
//
// dirfd is a file descriptor that the cloned child passes to fchdir before
// it issues the mount syscall with source, target, fstype, flags and data.
// The calling goroutine's working directory is not touched by this function.
//
// The returned error is nil on success. Otherwise it is a string conversion
// error, a wrapped errno from the clone call, a wrapped error from waiting on
// the child, or the child's exit status interpreted as an errno and wrapped
// with "failed to mount".
func FMountat(dirfd uintptr, source, target, fstype string, flags uintptr, data string) error {
var (
sourceP, targetP, fstypeP, dataP *byte
pid uintptr
ws unix.WaitStatus
err error
errno syscall.Errno
)
// convert all strings up front, before the clone
sourceP, err = syscall.BytePtrFromString(source)
if err != nil {
return err
}
targetP, err = syscall.BytePtrFromString(target)
if err != nil {
return err
}
fstypeP, err = syscall.BytePtrFromString(fstype)
if err != nil {
return err
}
// dataP stays nil when there is no mount data
if data != "" {
dataP, err = syscall.BytePtrFromString(data)
if err != nil {
return err
}
}
// forkAndMountat requires the OS thread to be locked
runtime.LockOSThread()
defer runtime.UnlockOSThread()
pid, errno = forkAndMountat(dirfd,
uintptr(unsafe.Pointer(sourceP)),
uintptr(unsafe.Pointer(targetP)),
uintptr(unsafe.Pointer(fstypeP)),
flags,
uintptr(unsafe.Pointer(dataP)))
if errno != 0 {
return errors.Wrap(errno, "failed to fork thread")
}
// wait for the child, retrying when the wait is interrupted
_, err = unix.Wait4(int(pid), &ws, 0, nil)
for err == syscall.EINTR {
_, err = unix.Wait4(int(pid), &ws, 0, nil)
}
if err != nil {
return errors.Wrapf(err, "failed to find pid=%d process", pid)
}
// the child exits with the errno of fchdir/mount as its exit status
errno = syscall.Errno(ws.ExitStatus())
if errno != 0 {
return errors.Wrap(errno, "failed to mount")
}
return nil
}
// forkAndMountat clones a child (SIGCHLD|CLONE_FILES), and in that child
// changes the working dir to dirfd and performs the mount.
//
// source, target, fstype and data are passed to the mount syscall unchanged
// as raw argument values; flags likewise.
//
// In the caller it returns the child's pid, or a non-zero errno if the clone
// failed. The child never returns from this function: it exits with the errno
// of fchdir/mount (0 on success) as its exit status.
//
// precondition: the runtime OS thread must be locked.
func forkAndMountat(dirfd uintptr, source, target, fstype, flags, data uintptr) (pid uintptr, errno syscall.Errno) {
// block signal during clone
beforeFork()
// the cloned thread shares the open file descriptor, but the thread
// never be reused by runtime.
pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
// clone failed (errno != 0) or we are the parent (pid != 0)
if errno != 0 || pid != 0 {
// restore all signals
afterFork()
return
}
// from here on we are in the child; only raw syscalls are used
// restore all signals
afterForkInChild()
// change working dir
_, _, errno = syscall.RawSyscall(syscall.SYS_FCHDIR, dirfd, 0, 0)
if errno != 0 {
goto childerr
}
_, _, errno = syscall.RawSyscall6(syscall.SYS_MOUNT, source, target, fstype, flags, data, 0)
childerr:
// report the result to the parent through the exit status
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
panic("unreachable")
}

169
sys/mount_linux_test.go Normal file
View File

@ -0,0 +1,169 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"io/ioutil"
"os"
"path/filepath"
"syscall"
"testing"
"time"
"github.com/containerd/continuity/fs/fstest"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// fMountatCaseFunc is the signature of a single FMountat test case; root is
// the temporary directory created for the case by makeTestForFMountat.
type fMountatCaseFunc func(t *testing.T, root string)
// TestFMountat runs the FMountat test cases as subtests. It is skipped when
// RunningUnprivileged reports true.
func TestFMountat(t *testing.T) {
	if RunningUnprivileged() {
		t.Skip("Needs to be run as root")
	}

	subtests := []struct {
		name string
		fn   fMountatCaseFunc
	}{
		{"Normal", testFMountatNormal},
		{"ChdirWithFileFd", testFMountatWithFileFd},
		{"MountWithInvalidSource", testFMountatWithInvalidSource},
	}
	for _, st := range subtests {
		t.Run(st.name, makeTestForFMountat(st.fn))
	}
}
// makeTestForFMountat wraps fn into a parallel subtest that runs inside a
// fresh temporary directory, which is removed when the subtest finishes.
func makeTestForFMountat(fn fMountatCaseFunc) func(t *testing.T) {
	run := func(t *testing.T) {
		t.Parallel()

		scratch, err := ioutil.TempDir("", "fmountat-test-")
		if err != nil {
			t.Fatal(err)
		}
		defer os.RemoveAll(scratch)

		fn(t, scratch)
	}
	return run
}
// testFMountatNormal bind-mounts root/work onto root/at/fs by calling FMountat
// with a descriptor for root/at and the relative target "fs". It then checks
// that the file created in work is readable through the mount point and that
// the test process's working directory is not root/at.
func testFMountatNormal(t *testing.T, root string) {
expectedContent := "bye re-exec!\n"
apply := fstest.Apply(
fstest.CreateFile("/hi", []byte(expectedContent), 0777),
)
// source directory of the bind mount
workdir := filepath.Join(root, "work")
if err := os.MkdirAll(workdir, 0777); err != nil {
t.Fatalf("failed to create dir(%s): %+v", workdir, err)
}
if err := apply.Apply(workdir); err != nil {
t.Fatalf("failed to prepare source dir: %+v", err)
}
// directory whose descriptor is handed to FMountat
atdir := filepath.Join(root, "at")
if err := os.MkdirAll(atdir, 0777); err != nil {
t.Fatalf("failed to create working dir(%s): %+v", atdir, err)
}
// mount point, addressed below as the relative path "fs"
fsdir := filepath.Join(atdir, "fs")
if err := os.MkdirAll(fsdir, 0777); err != nil {
t.Fatalf("failed to create mount point dir(%s): %+v", fsdir, err)
}
f, err := os.Open(atdir)
if err != nil {
t.Fatalf("failed to open dir(%s): %+v", atdir, err)
}
defer f.Close()
// mount work to fs
if err = FMountat(f.Fd(), workdir, "fs", "bind", unix.MS_BIND|unix.MS_RDONLY, ""); err != nil {
t.Fatalf("expected no error here, but got error: %+v", err)
}
defer umount(t, fsdir)
// check hi file
content, err := ioutil.ReadFile(filepath.Join(fsdir, "hi"))
if err != nil {
t.Fatalf("failed to read file: %+v", err)
}
if got := string(content); got != expectedContent {
t.Fatalf("expected to get(%v), but got(%v)", expectedContent, got)
}
// check the working directory
cwd, err := os.Getwd()
if err != nil {
t.Fatalf("failed to get current working dir: %+v", err)
}
if cwd == atdir {
t.Fatal("should not change the current working directory")
}
}
// testFMountatWithFileFd passes the descriptor of a regular file to FMountat
// and expects the underlying error to be ENOTDIR.
func testFMountatWithFileFd(t *testing.T, root string) {
	// not a directory
	expectedErr := syscall.ENOTDIR

	emptyFile := filepath.Join(root, "emptyFile")
	file, err := os.Create(emptyFile)
	if err != nil {
		t.Fatalf("failed to create file(%s): %+v", emptyFile, err)
	}
	defer file.Close()

	source := filepath.Join(root, "empty")
	target := filepath.Join(root, "work")
	err = FMountat(file.Fd(), source, target, "", 0, "")
	if got := errors.Cause(err); got != expectedErr {
		t.Fatalf("expected error %v, but got %v", expectedErr, got)
	}
}
// testFMountatWithInvalidSource calls FMountat with a source path that does
// not exist and expects the underlying error to be ENOENT. The mount target
// "at" is given relative to root, whose descriptor is passed to FMountat.
func testFMountatWithInvalidSource(t *testing.T, root string) {
	// no such file or directory
	expectedErr := syscall.ENOENT

	atdir := filepath.Join(root, "at")
	if err := os.MkdirAll(atdir, 0777); err != nil {
		t.Fatalf("failed to create dir(%s): %+v", atdir, err)
	}

	f, err := os.Open(root)
	if err != nil {
		// report the directory that was actually opened
		t.Fatalf("failed to open dir(%s): %+v", root, err)
	}
	defer f.Close()

	err = FMountat(f.Fd(), filepath.Join(root, "oops"), "at", "bind", unix.MS_BIND, "")
	if got := errors.Cause(err); got != expectedErr {
		t.Fatalf("expected error %v, but got %v", expectedErr, got)
	}
}
// umount repeatedly detaches target (MNT_DETACH), up to 50 attempts. It
// returns as soon as an unmount call reports EINVAL, sleeps 50ms after EBUSY,
// and fails the test if no attempt ever reports EINVAL.
func umount(t *testing.T, target string) {
	const attempts = 50

	for n := 0; n < attempts; n++ {
		err := unix.Unmount(target, unix.MNT_DETACH)
		if err == unix.EINVAL {
			return
		}
		if err == unix.EBUSY {
			time.Sleep(50 * time.Millisecond)
		}
		// success or any other error: simply try again
	}
	t.Fatalf("failed to unmount target %s", target)
}

View File

@ -0,0 +1,30 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
_ "unsafe" // required for go:linkname.
)
// beforeFork is bound via go:linkname to syscall.runtime_BeforeFork; it has
// no body here. forkAndMountat calls it right before the clone syscall.
//go:linkname beforeFork syscall.runtime_BeforeFork
func beforeFork()
// afterFork is bound via go:linkname to syscall.runtime_AfterFork; it has no
// body here. forkAndMountat calls it on the non-child return path of clone.
//go:linkname afterFork syscall.runtime_AfterFork
func afterFork()
// afterForkInChild is bound via go:linkname to
// syscall.runtime_AfterForkInChild; it has no body here. forkAndMountat calls
// it first thing in the cloned child.
//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
func afterForkInChild()

View File

@ -0,0 +1,15 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/