Support >= 128 layers in overlayfs snapshots

Auto-detect longest common dir in lowerdir option and compact it if the
option size is hitting one page size. If does, Use chdir + CLONE to do
mount thing to avoid hitting one page argument buffer in linux kernel
mount.

Signed-off-by: Wei Fu <fhfuwei@163.com>
This commit is contained in:
Wei Fu
2018-07-26 15:21:44 +08:00
parent 26e2dd6754
commit 67b54c6670
7 changed files with 673 additions and 2 deletions

119
sys/mount_linux.go Normal file
View File

@@ -0,0 +1,119 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"runtime"
"syscall"
"unsafe"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// FMountat performs mount from the provided directory.
func FMountat(dirfd uintptr, source, target, fstype string, flags uintptr, data string) error {
var (
sourceP, targetP, fstypeP, dataP *byte
pid uintptr
ws unix.WaitStatus
err error
errno syscall.Errno
)
sourceP, err = syscall.BytePtrFromString(source)
if err != nil {
return err
}
targetP, err = syscall.BytePtrFromString(target)
if err != nil {
return err
}
fstypeP, err = syscall.BytePtrFromString(fstype)
if err != nil {
return err
}
if data != "" {
dataP, err = syscall.BytePtrFromString(data)
if err != nil {
return err
}
}
runtime.LockOSThread()
defer runtime.UnlockOSThread()
pid, errno = forkAndMountat(dirfd,
uintptr(unsafe.Pointer(sourceP)),
uintptr(unsafe.Pointer(targetP)),
uintptr(unsafe.Pointer(fstypeP)),
flags,
uintptr(unsafe.Pointer(dataP)))
if errno != 0 {
return errors.Wrap(errno, "failed to fork thread")
}
_, err = unix.Wait4(int(pid), &ws, 0, nil)
for err == syscall.EINTR {
_, err = unix.Wait4(int(pid), &ws, 0, nil)
}
if err != nil {
return errors.Wrapf(err, "failed to find pid=%d process", pid)
}
errno = syscall.Errno(ws.ExitStatus())
if errno != 0 {
return errors.Wrap(errno, "failed to mount")
}
return nil
}
// forkAndMountat will fork thread, change working dir and mount.
//
// precondition: the runtime OS thread must be locked.
func forkAndMountat(dirfd uintptr, source, target, fstype, flags, data uintptr) (pid uintptr, errno syscall.Errno) {
// block signal during clone
beforeFork()
// the cloned thread shares the open file descriptor, but the thread
// never be reused by runtime.
pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
if errno != 0 || pid != 0 {
// restore all signals
afterFork()
return
}
// restore all signals
afterForkInChild()
// change working dir
_, _, errno = syscall.RawSyscall(syscall.SYS_FCHDIR, dirfd, 0, 0)
if errno != 0 {
goto childerr
}
_, _, errno = syscall.RawSyscall6(syscall.SYS_MOUNT, source, target, fstype, flags, data, 0)
childerr:
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
panic("unreachable")
}

169
sys/mount_linux_test.go Normal file
View File

@@ -0,0 +1,169 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"io/ioutil"
"os"
"path/filepath"
"syscall"
"testing"
"time"
"github.com/containerd/continuity/fs/fstest"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
type fMountatCaseFunc func(t *testing.T, root string)
func TestFMountat(t *testing.T) {
if RunningUnprivileged() {
t.Skip("Needs to be run as root")
return
}
t.Run("Normal", makeTestForFMountat(testFMountatNormal))
t.Run("ChdirWithFileFd", makeTestForFMountat(testFMountatWithFileFd))
t.Run("MountWithInvalidSource", makeTestForFMountat(testFMountatWithInvalidSource))
}
func makeTestForFMountat(fn fMountatCaseFunc) func(t *testing.T) {
return func(t *testing.T) {
t.Parallel()
suiteDir, err := ioutil.TempDir("", "fmountat-test-")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(suiteDir)
fn(t, suiteDir)
}
}
func testFMountatNormal(t *testing.T, root string) {
expectedContent := "bye re-exec!\n"
apply := fstest.Apply(
fstest.CreateFile("/hi", []byte(expectedContent), 0777),
)
workdir := filepath.Join(root, "work")
if err := os.MkdirAll(workdir, 0777); err != nil {
t.Fatalf("failed to create dir(%s): %+v", workdir, err)
}
if err := apply.Apply(workdir); err != nil {
t.Fatalf("failed to prepare source dir: %+v", err)
}
atdir := filepath.Join(root, "at")
if err := os.MkdirAll(atdir, 0777); err != nil {
t.Fatalf("failed to create working dir(%s): %+v", atdir, err)
}
fsdir := filepath.Join(atdir, "fs")
if err := os.MkdirAll(fsdir, 0777); err != nil {
t.Fatalf("failed to create mount point dir(%s): %+v", fsdir, err)
}
f, err := os.Open(atdir)
if err != nil {
t.Fatalf("failed to open dir(%s): %+v", atdir, err)
}
defer f.Close()
// mount work to fs
if err = FMountat(f.Fd(), workdir, "fs", "bind", unix.MS_BIND|unix.MS_RDONLY, ""); err != nil {
t.Fatalf("expected no error here, but got error: %+v", err)
}
defer umount(t, fsdir)
// check hi file
content, err := ioutil.ReadFile(filepath.Join(fsdir, "hi"))
if err != nil {
t.Fatalf("failed to read file: %+v", err)
}
if got := string(content); got != expectedContent {
t.Fatalf("expected to get(%v), but got(%v)", expectedContent, got)
}
// check the working directory
cwd, err := os.Getwd()
if err != nil {
t.Fatalf("failed to get current working dir: %+v", err)
}
if cwd == atdir {
t.Fatal("should not change the current working directory")
}
}
func testFMountatWithFileFd(t *testing.T, root string) {
// not a directory
expectedErr := syscall.Errno(20)
emptyFile := filepath.Join(root, "emptyFile")
f, err := os.Create(emptyFile)
if err != nil {
t.Fatalf("failed to create file(%s): %+v", emptyFile, err)
}
defer f.Close()
err = FMountat(f.Fd(), filepath.Join(root, "empty"), filepath.Join(root, "work"), "", 0, "")
if got := errors.Cause(err); got != expectedErr {
t.Fatalf("expected error %v, but got %v", expectedErr, got)
}
}
func testFMountatWithInvalidSource(t *testing.T, root string) {
// no such file or directory
expectedErr := syscall.Errno(2)
atdir := filepath.Join(root, "at")
if err := os.MkdirAll(atdir, 0777); err != nil {
t.Fatalf("failed to create dir(%s): %+v", atdir, err)
}
f, err := os.Open(root)
if err != nil {
t.Fatalf("failed to open dir(%s): %+v", atdir, err)
}
defer f.Close()
err = FMountat(f.Fd(), filepath.Join(root, "oops"), "at", "bind", unix.MS_BIND, "")
if got := errors.Cause(err); got != expectedErr {
t.Fatalf("expected error %v, but got %v", expectedErr, got)
}
}
func umount(t *testing.T, target string) {
for i := 0; i < 50; i++ {
if err := unix.Unmount(target, unix.MNT_DETACH); err != nil {
switch err {
case unix.EBUSY:
time.Sleep(50 * time.Millisecond)
continue
case unix.EINVAL:
return
default:
continue
}
}
}
t.Fatalf("failed to unmount target %s", target)
}

View File

@@ -0,0 +1,30 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
_ "unsafe" // required for go:linkname.
)
//go:linkname beforeFork syscall.runtime_BeforeFork
func beforeFork()
//go:linkname afterFork syscall.runtime_AfterFork
func afterFork()
//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
func afterForkInChild()

View File

@@ -0,0 +1,15 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/