Replace mount fork hack with CLONE_FS

This change spins up a new goroutine, locks it to a thread, then
unshares CLONE_FS which allows us to `Chdir` from inside the thread
without affecting the rest of the program.

The thread is no longer usable after unshare so it leaves the thread
locked to prevent go from returning the thread to the thread pool.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>
This commit is contained in:
Brian Goff 2022-10-11 23:47:16 +00:00
parent c21d1baa88
commit a24ef09937
7 changed files with 68 additions and 406 deletions

View File

@ -1,145 +0,0 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mount
import (
"fmt"
"runtime"
"syscall"
"unsafe"
"github.com/containerd/containerd/log"
"golang.org/x/sys/unix"
)
// fMountat performs mount from the provided directory.
func fMountat(dirfd uintptr, source, target, fstype string, flags uintptr, data string) error {
var (
sourceP, targetP, fstypeP, dataP *byte
pid uintptr
err error
errno, status syscall.Errno
)
sourceP, err = syscall.BytePtrFromString(source)
if err != nil {
return err
}
targetP, err = syscall.BytePtrFromString(target)
if err != nil {
return err
}
fstypeP, err = syscall.BytePtrFromString(fstype)
if err != nil {
return err
}
if data != "" {
dataP, err = syscall.BytePtrFromString(data)
if err != nil {
return err
}
}
runtime.LockOSThread()
defer runtime.UnlockOSThread()
var pipefds [2]int
if err := syscall.Pipe2(pipefds[:], syscall.O_CLOEXEC); err != nil {
return fmt.Errorf("failed to open pipe: %w", err)
}
defer func() {
// close both ends of the pipe in a deferred function, since open file
// descriptor table is shared with child
syscall.Close(pipefds[0])
syscall.Close(pipefds[1])
}()
pid, errno = forkAndMountat(dirfd,
uintptr(unsafe.Pointer(sourceP)),
uintptr(unsafe.Pointer(targetP)),
uintptr(unsafe.Pointer(fstypeP)),
flags,
uintptr(unsafe.Pointer(dataP)),
pipefds[1],
)
if errno != 0 {
return fmt.Errorf("failed to fork thread: %w", errno)
}
defer func() {
_, err := unix.Wait4(int(pid), nil, 0, nil)
for err == syscall.EINTR {
_, err = unix.Wait4(int(pid), nil, 0, nil)
}
if err != nil {
log.L.WithError(err).Debugf("failed to find pid=%d process", pid)
}
}()
_, _, errno = syscall.RawSyscall(syscall.SYS_READ,
uintptr(pipefds[0]),
uintptr(unsafe.Pointer(&status)),
unsafe.Sizeof(status))
if errno != 0 {
return fmt.Errorf("failed to read pipe: %w", errno)
}
if status != 0 {
return fmt.Errorf("failed to mount: %w", status)
}
return nil
}
// forkAndMountat will fork thread, change working dir and mount.
//
// precondition: the runtime OS thread must be locked.
func forkAndMountat(dirfd uintptr, source, target, fstype, flags, data uintptr, pipefd int) (pid uintptr, errno syscall.Errno) {
// block signal during clone
beforeFork()
// the cloned thread shares the open file descriptor, but the thread
// never be reused by runtime.
pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
if errno != 0 || pid != 0 {
// restore all signals
afterFork()
return
}
// restore all signals
afterForkInChild()
// change working dir
_, _, errno = syscall.RawSyscall(syscall.SYS_FCHDIR, dirfd, 0, 0)
if errno != 0 {
goto childerr
}
_, _, errno = syscall.RawSyscall6(syscall.SYS_MOUNT, source, target, fstype, flags, data, 0)
childerr:
_, _, errno = syscall.RawSyscall(syscall.SYS_WRITE, uintptr(pipefd), uintptr(unsafe.Pointer(&errno)), unsafe.Sizeof(errno))
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
panic("unreachable")
}

View File

@ -1,164 +0,0 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mount
import (
"errors"
"os"
"path/filepath"
"syscall"
"testing"
"time"
"github.com/containerd/continuity/fs/fstest"
"golang.org/x/sys/unix"
)
type fMountatCaseFunc func(t *testing.T, root string)
func TestFMountat(t *testing.T) {
if unix.Geteuid() != 0 {
t.Skip("Needs to be run as root")
return
}
t.Run("Normal", makeTestForFMountat(testFMountatNormal))
t.Run("ChdirWithFileFd", makeTestForFMountat(testFMountatWithFileFd))
t.Run("MountWithInvalidSource", makeTestForFMountat(testFMountatWithInvalidSource))
}
func makeTestForFMountat(fn fMountatCaseFunc) func(t *testing.T) {
return func(t *testing.T) {
t.Parallel()
suiteDir := t.TempDir()
fn(t, suiteDir)
}
}
func testFMountatNormal(t *testing.T, root string) {
expectedContent := "bye re-exec!\n"
apply := fstest.Apply(
fstest.CreateFile("/hi", []byte(expectedContent), 0777),
)
workdir := filepath.Join(root, "work")
if err := os.MkdirAll(workdir, 0777); err != nil {
t.Fatalf("failed to create dir(%s): %+v", workdir, err)
}
if err := apply.Apply(workdir); err != nil {
t.Fatalf("failed to prepare source dir: %+v", err)
}
atdir := filepath.Join(root, "at")
if err := os.MkdirAll(atdir, 0777); err != nil {
t.Fatalf("failed to create working dir(%s): %+v", atdir, err)
}
fsdir := filepath.Join(atdir, "fs")
if err := os.MkdirAll(fsdir, 0777); err != nil {
t.Fatalf("failed to create mount point dir(%s): %+v", fsdir, err)
}
f, err := os.Open(atdir)
if err != nil {
t.Fatalf("failed to open dir(%s): %+v", atdir, err)
}
defer f.Close()
// mount work to fs
if err = fMountat(f.Fd(), workdir, "fs", "bind", unix.MS_BIND|unix.MS_RDONLY, ""); err != nil {
t.Fatalf("expected no error here, but got error: %+v", err)
}
defer umount(t, fsdir)
// check hi file
content, err := os.ReadFile(filepath.Join(fsdir, "hi"))
if err != nil {
t.Fatalf("failed to read file: %+v", err)
}
if got := string(content); got != expectedContent {
t.Fatalf("expected to get(%v), but got(%v)", expectedContent, got)
}
// check the working directory
cwd, err := os.Getwd()
if err != nil {
t.Fatalf("failed to get current working dir: %+v", err)
}
if cwd == atdir {
t.Fatal("should not change the current working directory")
}
}
func testFMountatWithFileFd(t *testing.T, root string) {
// not a directory
expectedErr := syscall.Errno(20)
emptyFile := filepath.Join(root, "emptyFile")
f, err := os.Create(emptyFile)
if err != nil {
t.Fatalf("failed to create file(%s): %+v", emptyFile, err)
}
defer f.Close()
err = fMountat(f.Fd(), filepath.Join(root, "empty"), filepath.Join(root, "work"), "", 0, "")
if !errors.Is(err, expectedErr) {
t.Fatalf("expected error %v, but got %v", expectedErr, errors.Unwrap(err))
}
}
func testFMountatWithInvalidSource(t *testing.T, root string) {
// no such file or directory
expectedErr := syscall.Errno(2)
atdir := filepath.Join(root, "at")
if err := os.MkdirAll(atdir, 0777); err != nil {
t.Fatalf("failed to create dir(%s): %+v", atdir, err)
}
f, err := os.Open(root)
if err != nil {
t.Fatalf("failed to open dir(%s): %+v", atdir, err)
}
defer f.Close()
err = fMountat(f.Fd(), filepath.Join(root, "oops"), "at", "bind", unix.MS_BIND, "")
if !errors.Is(err, expectedErr) {
t.Fatalf("expected error %v, but got %v", expectedErr, err)
}
}
func umount(t *testing.T, target string) {
for i := 0; i < 50; i++ {
if err := unix.Unmount(target, unix.MNT_DETACH); err != nil {
switch err {
case unix.EBUSY:
time.Sleep(50 * time.Millisecond)
continue
case unix.EINVAL:
return
default:
continue
}
}
}
t.Fatalf("failed to unmount target %s", target)
}

View File

@ -21,6 +21,7 @@ import (
"fmt"
"os"
"path"
"runtime"
"strings"
"time"
@ -363,24 +364,29 @@ func mountAt(chdir string, source, target, fstype string, flags uintptr, data st
return unix.Mount(source, target, fstype, flags, data)
}
f, err := os.Open(chdir)
if err != nil {
return fmt.Errorf("failed to mountat: %w", err)
}
defer f.Close()
ch := make(chan error, 1)
go func() {
runtime.LockOSThread()
fs, err := f.Stat()
if err != nil {
return fmt.Errorf("failed to mountat: %w", err)
}
// Do not unlock this thread.
// If the thread is unlocked go will try to use it for other goroutines.
// However it is not possible to restore the thread state after CLONE_FS.
//
// Once the goroutine exits the thread should eventually be terminated by go.
if !fs.IsDir() {
return fmt.Errorf("failed to mountat: %s is not dir", chdir)
}
if err := fMountat(f.Fd(), source, target, fstype, flags, data); err != nil {
return fmt.Errorf("failed to mountat: %w", err)
}
return nil
if err := unix.Unshare(unix.CLONE_FS); err != nil {
ch <- err
return
}
if err := unix.Chdir(chdir); err != nil {
ch <- err
return
}
ch <- unix.Mount(source, target, fstype, flags, data)
}()
return <-ch
}
func (m *Mount) mountWithHelper(helperBinary, typePrefix, target string) error {

View File

@ -25,6 +25,7 @@ import (
"github.com/containerd/continuity/testutil"
exec "golang.org/x/sys/execabs"
"golang.org/x/sys/unix"
)
func TestLongestCommonPrefix(t *testing.T) {
@ -126,3 +127,48 @@ func TestFUSEHelper(t *testing.T) {
t.Fatal(err)
}
}
func TestMountAt(t *testing.T) {
testutil.RequiresRoot(t)
dir1 := t.TempDir()
dir2 := t.TempDir()
defer unix.Unmount(filepath.Join(dir2, "bar"), unix.MNT_DETACH)
if err := os.WriteFile(filepath.Join(dir1, "foo"), []byte("foo"), 0644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(dir2, "bar"), []byte{}, 0644); err != nil {
t.Fatal(err)
}
wd, err := os.Getwd()
if err != nil {
t.Fatal(err)
}
// mount ${dir1}/foo at ${dir2}/bar
// But since we are using `mountAt` we only need to specify the relative path to dir2 as the target mountAt will chdir to there.
if err := mountAt(dir2, filepath.Join(dir1, "foo"), "bar", "none", unix.MS_BIND, ""); err != nil {
t.Fatal(err)
}
b, err := os.ReadFile(filepath.Join(dir2, "bar"))
if err != nil {
t.Fatal(err)
}
if string(b) != "foo" {
t.Fatalf("unexpected file content: %s", b)
}
newWD, err := os.Getwd()
if err != nil {
t.Fatal(err)
}
if wd != newWD {
t.Fatalf("unexpected working directory: %s", newWD)
}
}

View File

@ -1,15 +0,0 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

View File

@ -1,33 +0,0 @@
//go:build linux && gc
// +build linux,gc
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mount
import (
_ "unsafe" // required for go:linkname.
)
//go:linkname beforeFork syscall.runtime_BeforeFork
func beforeFork()
//go:linkname afterFork syscall.runtime_AfterFork
func afterFork()
//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
func afterForkInChild()

View File

@ -1,33 +0,0 @@
//go:build linux && gccgo
// +build linux,gccgo
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package mount
import (
_ "unsafe" // required for go:linkname.
)
//go:linkname beforeFork syscall.runtime__BeforeFork
func beforeFork()
//go:linkname afterFork syscall.runtime__AfterFork
func afterFork()
//go:linkname afterForkInChild syscall.runtime__AfterForkInChild
func afterForkInChild()