Replace mount fork hack with CLONE_FS
This change spins up a new goroutine, locks it to an OS thread, and then unshares CLONE_FS on that thread, which allows us to `Chdir` from inside the thread without affecting the rest of the program. The thread's state cannot be restored after the unshare, so the goroutine leaves the thread locked to prevent Go from returning it to the thread pool. Signed-off-by: Brian Goff <cpuguy83@gmail.com>
This commit is contained in:
parent
c21d1baa88
commit
a24ef09937
@ -1,145 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright The containerd Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package mount
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"runtime"
|
|
||||||
"syscall"
|
|
||||||
"unsafe"
|
|
||||||
|
|
||||||
"github.com/containerd/containerd/log"
|
|
||||||
"golang.org/x/sys/unix"
|
|
||||||
)
|
|
||||||
|
|
||||||
// fMountat performs mount from the provided directory.
|
|
||||||
func fMountat(dirfd uintptr, source, target, fstype string, flags uintptr, data string) error {
|
|
||||||
var (
|
|
||||||
sourceP, targetP, fstypeP, dataP *byte
|
|
||||||
pid uintptr
|
|
||||||
err error
|
|
||||||
errno, status syscall.Errno
|
|
||||||
)
|
|
||||||
|
|
||||||
sourceP, err = syscall.BytePtrFromString(source)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
targetP, err = syscall.BytePtrFromString(target)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
fstypeP, err = syscall.BytePtrFromString(fstype)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if data != "" {
|
|
||||||
dataP, err = syscall.BytePtrFromString(data)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
runtime.LockOSThread()
|
|
||||||
defer runtime.UnlockOSThread()
|
|
||||||
|
|
||||||
var pipefds [2]int
|
|
||||||
if err := syscall.Pipe2(pipefds[:], syscall.O_CLOEXEC); err != nil {
|
|
||||||
return fmt.Errorf("failed to open pipe: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
defer func() {
|
|
||||||
// close both ends of the pipe in a deferred function, since open file
|
|
||||||
// descriptor table is shared with child
|
|
||||||
syscall.Close(pipefds[0])
|
|
||||||
syscall.Close(pipefds[1])
|
|
||||||
}()
|
|
||||||
|
|
||||||
pid, errno = forkAndMountat(dirfd,
|
|
||||||
uintptr(unsafe.Pointer(sourceP)),
|
|
||||||
uintptr(unsafe.Pointer(targetP)),
|
|
||||||
uintptr(unsafe.Pointer(fstypeP)),
|
|
||||||
flags,
|
|
||||||
uintptr(unsafe.Pointer(dataP)),
|
|
||||||
pipefds[1],
|
|
||||||
)
|
|
||||||
|
|
||||||
if errno != 0 {
|
|
||||||
return fmt.Errorf("failed to fork thread: %w", errno)
|
|
||||||
}
|
|
||||||
|
|
||||||
defer func() {
|
|
||||||
_, err := unix.Wait4(int(pid), nil, 0, nil)
|
|
||||||
for err == syscall.EINTR {
|
|
||||||
_, err = unix.Wait4(int(pid), nil, 0, nil)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
log.L.WithError(err).Debugf("failed to find pid=%d process", pid)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
_, _, errno = syscall.RawSyscall(syscall.SYS_READ,
|
|
||||||
uintptr(pipefds[0]),
|
|
||||||
uintptr(unsafe.Pointer(&status)),
|
|
||||||
unsafe.Sizeof(status))
|
|
||||||
if errno != 0 {
|
|
||||||
return fmt.Errorf("failed to read pipe: %w", errno)
|
|
||||||
}
|
|
||||||
|
|
||||||
if status != 0 {
|
|
||||||
return fmt.Errorf("failed to mount: %w", status)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// forkAndMountat will fork thread, change working dir and mount.
|
|
||||||
//
|
|
||||||
// precondition: the runtime OS thread must be locked.
|
|
||||||
func forkAndMountat(dirfd uintptr, source, target, fstype, flags, data uintptr, pipefd int) (pid uintptr, errno syscall.Errno) {
|
|
||||||
|
|
||||||
// block signal during clone
|
|
||||||
beforeFork()
|
|
||||||
|
|
||||||
// the cloned thread shares the open file descriptor, but the thread
|
|
||||||
// will never be reused by the runtime.
|
|
||||||
pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
|
|
||||||
if errno != 0 || pid != 0 {
|
|
||||||
// restore all signals
|
|
||||||
afterFork()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// restore all signals
|
|
||||||
afterForkInChild()
|
|
||||||
|
|
||||||
// change working dir
|
|
||||||
_, _, errno = syscall.RawSyscall(syscall.SYS_FCHDIR, dirfd, 0, 0)
|
|
||||||
if errno != 0 {
|
|
||||||
goto childerr
|
|
||||||
}
|
|
||||||
_, _, errno = syscall.RawSyscall6(syscall.SYS_MOUNT, source, target, fstype, flags, data, 0)
|
|
||||||
|
|
||||||
childerr:
|
|
||||||
_, _, errno = syscall.RawSyscall(syscall.SYS_WRITE, uintptr(pipefd), uintptr(unsafe.Pointer(&errno)), unsafe.Sizeof(errno))
|
|
||||||
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
|
|
||||||
panic("unreachable")
|
|
||||||
}
|
|
@ -1,164 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright The containerd Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package mount
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"syscall"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/containerd/continuity/fs/fstest"
|
|
||||||
"golang.org/x/sys/unix"
|
|
||||||
)
|
|
||||||
|
|
||||||
type fMountatCaseFunc func(t *testing.T, root string)
|
|
||||||
|
|
||||||
func TestFMountat(t *testing.T) {
|
|
||||||
if unix.Geteuid() != 0 {
|
|
||||||
t.Skip("Needs to be run as root")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
t.Run("Normal", makeTestForFMountat(testFMountatNormal))
|
|
||||||
t.Run("ChdirWithFileFd", makeTestForFMountat(testFMountatWithFileFd))
|
|
||||||
t.Run("MountWithInvalidSource", makeTestForFMountat(testFMountatWithInvalidSource))
|
|
||||||
}
|
|
||||||
|
|
||||||
func makeTestForFMountat(fn fMountatCaseFunc) func(t *testing.T) {
|
|
||||||
return func(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
suiteDir := t.TempDir()
|
|
||||||
|
|
||||||
fn(t, suiteDir)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func testFMountatNormal(t *testing.T, root string) {
|
|
||||||
expectedContent := "bye re-exec!\n"
|
|
||||||
apply := fstest.Apply(
|
|
||||||
fstest.CreateFile("/hi", []byte(expectedContent), 0777),
|
|
||||||
)
|
|
||||||
|
|
||||||
workdir := filepath.Join(root, "work")
|
|
||||||
if err := os.MkdirAll(workdir, 0777); err != nil {
|
|
||||||
t.Fatalf("failed to create dir(%s): %+v", workdir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := apply.Apply(workdir); err != nil {
|
|
||||||
t.Fatalf("failed to prepare source dir: %+v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
atdir := filepath.Join(root, "at")
|
|
||||||
if err := os.MkdirAll(atdir, 0777); err != nil {
|
|
||||||
t.Fatalf("failed to create working dir(%s): %+v", atdir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
fsdir := filepath.Join(atdir, "fs")
|
|
||||||
if err := os.MkdirAll(fsdir, 0777); err != nil {
|
|
||||||
t.Fatalf("failed to create mount point dir(%s): %+v", fsdir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
f, err := os.Open(atdir)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("failed to open dir(%s): %+v", atdir, err)
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
// mount work to fs
|
|
||||||
if err = fMountat(f.Fd(), workdir, "fs", "bind", unix.MS_BIND|unix.MS_RDONLY, ""); err != nil {
|
|
||||||
t.Fatalf("expected no error here, but got error: %+v", err)
|
|
||||||
}
|
|
||||||
defer umount(t, fsdir)
|
|
||||||
|
|
||||||
// check hi file
|
|
||||||
content, err := os.ReadFile(filepath.Join(fsdir, "hi"))
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("failed to read file: %+v", err)
|
|
||||||
}
|
|
||||||
if got := string(content); got != expectedContent {
|
|
||||||
t.Fatalf("expected to get(%v), but got(%v)", expectedContent, got)
|
|
||||||
}
|
|
||||||
|
|
||||||
// check the working directory
|
|
||||||
cwd, err := os.Getwd()
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("failed to get current working dir: %+v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if cwd == atdir {
|
|
||||||
t.Fatal("should not change the current working directory")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func testFMountatWithFileFd(t *testing.T, root string) {
|
|
||||||
// not a directory
|
|
||||||
expectedErr := syscall.Errno(20)
|
|
||||||
|
|
||||||
emptyFile := filepath.Join(root, "emptyFile")
|
|
||||||
f, err := os.Create(emptyFile)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("failed to create file(%s): %+v", emptyFile, err)
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
err = fMountat(f.Fd(), filepath.Join(root, "empty"), filepath.Join(root, "work"), "", 0, "")
|
|
||||||
if !errors.Is(err, expectedErr) {
|
|
||||||
t.Fatalf("expected error %v, but got %v", expectedErr, errors.Unwrap(err))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func testFMountatWithInvalidSource(t *testing.T, root string) {
|
|
||||||
// no such file or directory
|
|
||||||
expectedErr := syscall.Errno(2)
|
|
||||||
|
|
||||||
atdir := filepath.Join(root, "at")
|
|
||||||
if err := os.MkdirAll(atdir, 0777); err != nil {
|
|
||||||
t.Fatalf("failed to create dir(%s): %+v", atdir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
f, err := os.Open(root)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("failed to open dir(%s): %+v", atdir, err)
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
err = fMountat(f.Fd(), filepath.Join(root, "oops"), "at", "bind", unix.MS_BIND, "")
|
|
||||||
if !errors.Is(err, expectedErr) {
|
|
||||||
t.Fatalf("expected error %v, but got %v", expectedErr, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func umount(t *testing.T, target string) {
|
|
||||||
for i := 0; i < 50; i++ {
|
|
||||||
if err := unix.Unmount(target, unix.MNT_DETACH); err != nil {
|
|
||||||
switch err {
|
|
||||||
case unix.EBUSY:
|
|
||||||
time.Sleep(50 * time.Millisecond)
|
|
||||||
continue
|
|
||||||
case unix.EINVAL:
|
|
||||||
return
|
|
||||||
default:
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
t.Fatalf("failed to unmount target %s", target)
|
|
||||||
}
|
|
@ -21,6 +21,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
|
"runtime"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -363,24 +364,29 @@ func mountAt(chdir string, source, target, fstype string, flags uintptr, data st
|
|||||||
return unix.Mount(source, target, fstype, flags, data)
|
return unix.Mount(source, target, fstype, flags, data)
|
||||||
}
|
}
|
||||||
|
|
||||||
f, err := os.Open(chdir)
|
ch := make(chan error, 1)
|
||||||
if err != nil {
|
go func() {
|
||||||
return fmt.Errorf("failed to mountat: %w", err)
|
runtime.LockOSThread()
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
fs, err := f.Stat()
|
// Do not unlock this thread.
|
||||||
if err != nil {
|
// If the thread is unlocked, Go will try to use it for other goroutines.
|
||||||
return fmt.Errorf("failed to mountat: %w", err)
|
// However it is not possible to restore the thread state after CLONE_FS.
|
||||||
}
|
//
|
||||||
|
// Once the goroutine exits, the thread should eventually be terminated by Go.
|
||||||
|
|
||||||
if !fs.IsDir() {
|
if err := unix.Unshare(unix.CLONE_FS); err != nil {
|
||||||
return fmt.Errorf("failed to mountat: %s is not dir", chdir)
|
ch <- err
|
||||||
}
|
return
|
||||||
if err := fMountat(f.Fd(), source, target, fstype, flags, data); err != nil {
|
}
|
||||||
return fmt.Errorf("failed to mountat: %w", err)
|
|
||||||
}
|
if err := unix.Chdir(chdir); err != nil {
|
||||||
return nil
|
ch <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
ch <- unix.Mount(source, target, fstype, flags, data)
|
||||||
|
}()
|
||||||
|
return <-ch
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Mount) mountWithHelper(helperBinary, typePrefix, target string) error {
|
func (m *Mount) mountWithHelper(helperBinary, typePrefix, target string) error {
|
||||||
|
@ -25,6 +25,7 @@ import (
|
|||||||
|
|
||||||
"github.com/containerd/continuity/testutil"
|
"github.com/containerd/continuity/testutil"
|
||||||
exec "golang.org/x/sys/execabs"
|
exec "golang.org/x/sys/execabs"
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestLongestCommonPrefix(t *testing.T) {
|
func TestLongestCommonPrefix(t *testing.T) {
|
||||||
@ -126,3 +127,48 @@ func TestFUSEHelper(t *testing.T) {
|
|||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMountAt(t *testing.T) {
|
||||||
|
testutil.RequiresRoot(t)
|
||||||
|
|
||||||
|
dir1 := t.TempDir()
|
||||||
|
dir2 := t.TempDir()
|
||||||
|
|
||||||
|
defer unix.Unmount(filepath.Join(dir2, "bar"), unix.MNT_DETACH)
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(dir1, "foo"), []byte("foo"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(dir2, "bar"), []byte{}, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
wd, err := os.Getwd()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// mount ${dir1}/foo at ${dir2}/bar
|
||||||
|
// But since we are using `mountAt`, we only need to specify the target as a path relative to dir2, because mountAt will chdir to dir2 first.
|
||||||
|
if err := mountAt(dir2, filepath.Join(dir1, "foo"), "bar", "none", unix.MS_BIND, ""); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
b, err := os.ReadFile(filepath.Join(dir2, "bar"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if string(b) != "foo" {
|
||||||
|
t.Fatalf("unexpected file content: %s", b)
|
||||||
|
}
|
||||||
|
|
||||||
|
newWD, err := os.Getwd()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if wd != newWD {
|
||||||
|
t.Fatalf("unexpected working directory: %s", newWD)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,15 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright The containerd Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
@ -1,33 +0,0 @@
|
|||||||
//go:build linux && gc
|
|
||||||
// +build linux,gc
|
|
||||||
|
|
||||||
/*
|
|
||||||
Copyright The containerd Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package mount
|
|
||||||
|
|
||||||
import (
|
|
||||||
_ "unsafe" // required for go:linkname.
|
|
||||||
)
|
|
||||||
|
|
||||||
//go:linkname beforeFork syscall.runtime_BeforeFork
|
|
||||||
func beforeFork()
|
|
||||||
|
|
||||||
//go:linkname afterFork syscall.runtime_AfterFork
|
|
||||||
func afterFork()
|
|
||||||
|
|
||||||
//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
|
|
||||||
func afterForkInChild()
|
|
@ -1,33 +0,0 @@
|
|||||||
//go:build linux && gccgo
|
|
||||||
// +build linux,gccgo
|
|
||||||
|
|
||||||
/*
|
|
||||||
Copyright The containerd Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package mount
|
|
||||||
|
|
||||||
import (
|
|
||||||
_ "unsafe" // required for go:linkname.
|
|
||||||
)
|
|
||||||
|
|
||||||
//go:linkname beforeFork syscall.runtime__BeforeFork
|
|
||||||
func beforeFork()
|
|
||||||
|
|
||||||
//go:linkname afterFork syscall.runtime__AfterFork
|
|
||||||
func afterFork()
|
|
||||||
|
|
||||||
//go:linkname afterForkInChild syscall.runtime__AfterForkInChild
|
|
||||||
func afterForkInChild()
|
|
Loading…
Reference in New Issue
Block a user