The Go runtime has started to [lock down future uses of linkname][1] as of
go1.23. In the Go source code, the containerd project is called out in a
comment known as the [hall of shame][2]. containerd uses `go:linkname` to fork
a no-op subprocess efficiently. However, given that comment, I would like to
switch to ptrace and remove `go:linkname` from the whole repository.
With go1.22 `go:linkname`:
```bash
$ go test -bench=.  -benchmem ./ -exec sudo
goos: linux
goarch: amd64
pkg: github.com/containerd/containerd/v2/core/mount
cpu: AMD Ryzen 7 5800H with Radeon Graphics
BenchmarkBatchRunGetUsernsFD_Concurrent1-16                 2440            533320 ns/op            1145 B/op         43 allocs/op
BenchmarkBatchRunGetUsernsFD_Concurrent10-16                 342           3661616 ns/op           11562 B/op        421 allocs/op
PASS
ok      github.com/containerd/containerd/v2/core/mount  2.983s
```
With go1.22 `ptrace`:
```bash
$ go test -bench=.  -benchmem ./ -exec sudo
goos: linux
goarch: amd64
pkg: github.com/containerd/containerd/v2/core/mount
cpu: AMD Ryzen 7 5800H with Radeon Graphics
BenchmarkBatchRunGetUsernsFD_Concurrent1-16                 1785            739557 ns/op            3948 B/op         68 allocs/op
BenchmarkBatchRunGetUsernsFD_Concurrent10-16                 328           4024300 ns/op           39601 B/op        671 allocs/op
PASS
ok      github.com/containerd/containerd/v2/core/mount  3.104s
```
With go1.23 `ptrace`:
```bash
$ go test -bench=.  -benchmem ./ -exec sudo
goos: linux
goarch: amd64
pkg: github.com/containerd/containerd/v2/core/mount
cpu: AMD Ryzen 7 5800H with Radeon Graphics
BenchmarkBatchRunGetUsernsFD_Concurrent1-16                 1815            723252 ns/op            4220 B/op         69 allocs/op
BenchmarkBatchRunGetUsernsFD_Concurrent10-16                 319           3957157 ns/op           42351 B/op        682 allocs/op
PASS
ok      github.com/containerd/containerd/v2/core/mount  3.051s
```
Diff:
The `ptrace` mode is slower than the `go:linkname` mode. However, it's acceptable.
```
goos: linux
goarch: amd64
pkg: github.com/containerd/containerd/v2/core/mount
cpu: AMD Ryzen 7 5800H with Radeon Graphics
                                    │ go122-golinkname │             go122-ptrace              │             go123-ptrace              │
                                    │      sec/op      │    sec/op     vs base                 │    sec/op     vs base                 │
BatchRunGetUsernsFD_Concurrent1-16        533.3µ ± ∞ ¹   739.6µ ± ∞ ¹        ~ (p=1.000 n=1) ²   723.3µ ± ∞ ¹        ~ (p=1.000 n=1) ²
BatchRunGetUsernsFD_Concurrent10-16       3.662m ± ∞ ¹   4.024m ± ∞ ¹        ~ (p=1.000 n=1) ²   3.957m ± ∞ ¹        ~ (p=1.000 n=1) ²
geomean                                   1.397m         1.725m        +23.45%                   1.692m        +21.06%
¹ need >= 6 samples for confidence interval at level 0.95
² need >= 4 samples to detect a difference at alpha level 0.05
                                    │ go122-golinkname │              go122-ptrace               │              go123-ptrace               │
                                    │       B/op       │     B/op       vs base                  │     B/op       vs base                  │
BatchRunGetUsernsFD_Concurrent1-16       1.118Ki ± ∞ ¹   3.855Ki ± ∞ ¹         ~ (p=1.000 n=1) ²   4.121Ki ± ∞ ¹         ~ (p=1.000 n=1) ²
BatchRunGetUsernsFD_Concurrent10-16      11.29Ki ± ∞ ¹   38.67Ki ± ∞ ¹         ~ (p=1.000 n=1) ²   41.36Ki ± ∞ ¹         ~ (p=1.000 n=1) ²
geomean                                  3.553Ki         12.21Ki        +243.65%                   13.06Ki        +267.43%
¹ need >= 6 samples for confidence interval at level 0.95
² need >= 4 samples to detect a difference at alpha level 0.05
                                    │ go122-golinkname │             go122-ptrace             │             go123-ptrace             │
                                    │    allocs/op     │  allocs/op   vs base                 │  allocs/op   vs base                 │
BatchRunGetUsernsFD_Concurrent1-16         43.00 ± ∞ ¹   68.00 ± ∞ ¹        ~ (p=1.000 n=1) ²   69.00 ± ∞ ¹        ~ (p=1.000 n=1) ²
BatchRunGetUsernsFD_Concurrent10-16        421.0 ± ∞ ¹   671.0 ± ∞ ¹        ~ (p=1.000 n=1) ²   682.0 ± ∞ ¹        ~ (p=1.000 n=1) ²
geomean                                    134.5         213.6        +58.76%                   216.9        +61.23%
¹ need >= 6 samples for confidence interval at level 0.95
² need >= 4 samples to detect a difference at alpha level 0.05
```
[1]: <https://github.com/golang/go/issues/67401>
[2]: <https://github.com/golang/go/blob/release-branch.go1.23/src/runtime/proc.go#L4820>
Signed-off-by: Wei Fu <fuweid89@gmail.com>
		
	
		
			
				
	
	
		
			105 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			105 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
/*
 | 
						|
   Copyright The containerd Authors.
 | 
						|
 | 
						|
   Licensed under the Apache License, Version 2.0 (the "License");
 | 
						|
   you may not use this file except in compliance with the License.
 | 
						|
   You may obtain a copy of the License at
 | 
						|
 | 
						|
       http://www.apache.org/licenses/LICENSE-2.0
 | 
						|
 | 
						|
   Unless required by applicable law or agreed to in writing, software
 | 
						|
   distributed under the License is distributed on an "AS IS" BASIS,
 | 
						|
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
						|
   See the License for the specific language governing permissions and
 | 
						|
   limitations under the License.
 | 
						|
*/
 | 
						|
 | 
						|
package mount
 | 
						|
 | 
						|
import (
 | 
						|
	"fmt"
 | 
						|
	"os"
 | 
						|
	"strconv"
 | 
						|
	"strings"
 | 
						|
	"syscall"
 | 
						|
 | 
						|
	"golang.org/x/sys/unix"
 | 
						|
)
 | 
						|
 | 
						|
// parseIDMapping converts a single `container-id:host-id:size` mapping string
// into a one-element slice of syscall.SysProcIDMap.
//
// An error is returned when the string does not consist of exactly three
// colon-separated fields, when any field is not a valid integer, or when any
// of the three values is negative. Zero passes the sign check for every
// field, including size.
//
// TODO: Support multiple mappings in future
func parseIDMapping(mapping string) ([]syscall.SysProcIDMap, error) {
	fields := strings.Split(mapping, ":")
	if len(fields) != 3 {
		return nil, fmt.Errorf("user namespace mappings require the format `container-id:host-id:size`")
	}

	var (
		entry syscall.SysProcIDMap
		err   error
	)

	// Fields are parsed left to right so the first malformed one is reported.
	if entry.ContainerID, err = strconv.Atoi(fields[0]); err != nil {
		return nil, fmt.Errorf("invalid container id for user namespace remapping, %w", err)
	}
	if entry.HostID, err = strconv.Atoi(fields[1]); err != nil {
		return nil, fmt.Errorf("invalid host id for user namespace remapping, %w", err)
	}
	if entry.Size, err = strconv.Atoi(fields[2]); err != nil {
		return nil, fmt.Errorf("invalid size for user namespace remapping, %w", err)
	}

	// Atoi accepts a leading minus sign, so negative values are rejected here.
	switch {
	case entry.ContainerID < 0, entry.HostID < 0, entry.Size < 0:
		return nil, fmt.Errorf("invalid mapping %s, all IDs and size must be positive integers", mapping)
	}

	return []syscall.SysProcIDMap{entry}, nil
}
 | 
						|
 | 
						|
// IDMapMount applies GID/UID shift according to gidmap/uidmap for target path
 | 
						|
func IDMapMount(source, target string, usernsFd int) (err error) {
 | 
						|
	var (
 | 
						|
		attr unix.MountAttr
 | 
						|
	)
 | 
						|
 | 
						|
	attr.Attr_set = unix.MOUNT_ATTR_IDMAP
 | 
						|
	attr.Attr_clr = 0
 | 
						|
	attr.Propagation = 0
 | 
						|
	attr.Userns_fd = uint64(usernsFd)
 | 
						|
 | 
						|
	dFd, err := unix.OpenTree(-int(unix.EBADF), source, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC|unix.AT_EMPTY_PATH))
 | 
						|
	if err != nil {
 | 
						|
		return fmt.Errorf("Unable to open tree for %s: %w", target, err)
 | 
						|
	}
 | 
						|
 | 
						|
	defer unix.Close(dFd)
 | 
						|
	if err = unix.MountSetattr(dFd, "", unix.AT_EMPTY_PATH, &attr); err != nil {
 | 
						|
		return fmt.Errorf("Unable to shift GID/UID for %s: %w", target, err)
 | 
						|
	}
 | 
						|
 | 
						|
	if err = unix.MoveMount(dFd, "", -int(unix.EBADF), target, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
 | 
						|
		return fmt.Errorf("Unable to attach mount tree to %s: %w", target, err)
 | 
						|
	}
 | 
						|
	return nil
 | 
						|
}
 | 
						|
 | 
						|
// GetUsernsFD forks the current process and creates a user namespace using
 | 
						|
// the specified mappings.
 | 
						|
func GetUsernsFD(uidmap, gidmap string) (_usernsFD *os.File, _ error) {
 | 
						|
	uidMaps, err := parseIDMapping(uidmap)
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
 | 
						|
	gidMaps, err := parseIDMapping(gidmap)
 | 
						|
	if err != nil {
 | 
						|
		return nil, err
 | 
						|
	}
 | 
						|
	return getUsernsFD(uidMaps, gidMaps)
 | 
						|
}
 |