 3cd8f9734d
			
		
	
	3cd8f9734d
	
	
	
		
			
			The Go runtime has started to [lock down future uses of linkname][1] since
go1.23. In the go source code, containerd project has been marked in the
comment, [hall of shame][2]. Well, the go:linkname is used to fork no-op
subprocess efficiently. However, since that comment, I would like to use
ptrace and remove go:linkname in the whole repository.
With go1.22 `go:linkname`:
```bash
$ go test -bench=.  -benchmem ./ -exec sudo
goos: linux
goarch: amd64
pkg: github.com/containerd/containerd/v2/core/mount
cpu: AMD Ryzen 7 5800H with Radeon Graphics
BenchmarkBatchRunGetUsernsFD_Concurrent1-16                 2440            533320 ns/op            1145 B/op         43 allocs/op
BenchmarkBatchRunGetUsernsFD_Concurrent10-16                 342           3661616 ns/op           11562 B/op        421 allocs/op
PASS
ok      github.com/containerd/containerd/v2/core/mount  2.983s
```
With go1.22 `ptrace`:
```bash
$ go test -bench=.  -benchmem ./ -exec sudo
goos: linux
goarch: amd64
pkg: github.com/containerd/containerd/v2/core/mount
cpu: AMD Ryzen 7 5800H with Radeon Graphics
BenchmarkBatchRunGetUsernsFD_Concurrent1-16                 1785            739557 ns/op            3948 B/op         68 allocs/op
BenchmarkBatchRunGetUsernsFD_Concurrent10-16                 328           4024300 ns/op           39601 B/op        671 allocs/op
PASS
ok      github.com/containerd/containerd/v2/core/mount  3.104s
```
With go1.23 `ptrace`:
```bash
$ go test -bench=.  -benchmem ./ -exec sudo
goos: linux
goarch: amd64
pkg: github.com/containerd/containerd/v2/core/mount
cpu: AMD Ryzen 7 5800H with Radeon Graphics
BenchmarkBatchRunGetUsernsFD_Concurrent1-16                 1815            723252 ns/op            4220 B/op         69 allocs/op
BenchmarkBatchRunGetUsernsFD_Concurrent10-16                 319           3957157 ns/op           42351 B/op        682 allocs/op
PASS
ok      github.com/containerd/containerd/v2/core/mount  3.051s
```
Diff:
The `ptrace` is slower than `go:linkname` mode. However, it's accepctable.
```
goos: linux
goarch: amd64
pkg: github.com/containerd/containerd/v2/core/mount
cpu: AMD Ryzen 7 5800H with Radeon Graphics
                                    │ go122-golinkname │             go122-ptrace              │             go123-ptrace              │
                                    │      sec/op      │    sec/op     vs base                 │    sec/op     vs base                 │
BatchRunGetUsernsFD_Concurrent1-16        533.3µ ± ∞ ¹   739.6µ ± ∞ ¹        ~ (p=1.000 n=1) ²   723.3µ ± ∞ ¹        ~ (p=1.000 n=1) ²
BatchRunGetUsernsFD_Concurrent10-16       3.662m ± ∞ ¹   4.024m ± ∞ ¹        ~ (p=1.000 n=1) ²   3.957m ± ∞ ¹        ~ (p=1.000 n=1) ²
geomean                                   1.397m         1.725m        +23.45%                   1.692m        +21.06%
¹ need >= 6 samples for confidence interval at level 0.95
² need >= 4 samples to detect a difference at alpha level 0.05
                                    │ go122-golinkname │              go122-ptrace               │              go123-ptrace               │
                                    │       B/op       │     B/op       vs base                  │     B/op       vs base                  │
BatchRunGetUsernsFD_Concurrent1-16       1.118Ki ± ∞ ¹   3.855Ki ± ∞ ¹         ~ (p=1.000 n=1) ²   4.121Ki ± ∞ ¹         ~ (p=1.000 n=1) ²
BatchRunGetUsernsFD_Concurrent10-16      11.29Ki ± ∞ ¹   38.67Ki ± ∞ ¹         ~ (p=1.000 n=1) ²   41.36Ki ± ∞ ¹         ~ (p=1.000 n=1) ²
geomean                                  3.553Ki         12.21Ki        +243.65%                   13.06Ki        +267.43%
¹ need >= 6 samples for confidence interval at level 0.95
² need >= 4 samples to detect a difference at alpha level 0.05
                                    │ go122-golinkname │             go122-ptrace             │             go123-ptrace             │
                                    │    allocs/op     │  allocs/op   vs base                 │  allocs/op   vs base                 │
BatchRunGetUsernsFD_Concurrent1-16         43.00 ± ∞ ¹   68.00 ± ∞ ¹        ~ (p=1.000 n=1) ²   69.00 ± ∞ ¹        ~ (p=1.000 n=1) ²
BatchRunGetUsernsFD_Concurrent10-16        421.0 ± ∞ ¹   671.0 ± ∞ ¹        ~ (p=1.000 n=1) ²   682.0 ± ∞ ¹        ~ (p=1.000 n=1) ²
geomean                                    134.5         213.6        +58.76%                   216.9        +61.23%
¹ need >= 6 samples for confidence interval at level 0.95
² need >= 4 samples to detect a difference at alpha level 0.05
```
[1]: <https://github.com/golang/go/issues/67401>
[2]: <https://github.com/golang/go/blob/release-branch.go1.23/src/runtime/proc.go#L4820>
Signed-off-by: Wei Fu <fuweid89@gmail.com>
		
	
		
			
				
	
	
		
			86 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			86 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| //go:build go1.23 && linux
 | |
| 
 | |
| /*
 | |
|    Copyright The containerd Authors.
 | |
| 
 | |
|    Licensed under the Apache License, Version 2.0 (the "License");
 | |
|    you may not use this file except in compliance with the License.
 | |
|    You may obtain a copy of the License at
 | |
| 
 | |
|        http://www.apache.org/licenses/LICENSE-2.0
 | |
| 
 | |
|    Unless required by applicable law or agreed to in writing, software
 | |
|    distributed under the License is distributed on an "AS IS" BASIS,
 | |
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
|    See the License for the specific language governing permissions and
 | |
|    limitations under the License.
 | |
| */
 | |
| 
 | |
| package mount
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"os"
 | |
| 	"syscall"
 | |
| 
 | |
| 	"github.com/containerd/containerd/v2/pkg/sys"
 | |
| 
 | |
| 	"golang.org/x/sys/unix"
 | |
| )
 | |
| 
 | |
| // getUsernsFD returns pinnable user namespace's file descriptor.
 | |
| //
 | |
| // NOTE: The GO runtime uses pidfd to handle subprocess since go1.23. However,
 | |
| // it has double close issue tracked by [1]. We can't use pidfd directly and
 | |
| // the GO runtime doesn't export interface to show if it's using pidfd or not.
 | |
| // So, we call `sys.SupportsPidFD` first and then use `os.Process` directly.
 | |
| //
 | |
| // [1]: https://github.com/golang/go/issues/68984
 | |
| func getUsernsFD(uidMaps, gidMaps []syscall.SysProcIDMap) (_ *os.File, retErr error) {
 | |
| 	if !sys.SupportsPidFD() {
 | |
| 		return nil, fmt.Errorf("failed to prevent pid reused issue because pidfd isn't supported")
 | |
| 	}
 | |
| 
 | |
| 	proc, err := os.StartProcess("/proc/self/exe", []string{"containerd[getUsernsFD]"}, &os.ProcAttr{
 | |
| 		Sys: &syscall.SysProcAttr{
 | |
| 			Cloneflags:  unix.CLONE_NEWUSER,
 | |
| 			UidMappings: uidMaps,
 | |
| 			GidMappings: gidMaps,
 | |
| 			// NOTE: It's reexec but it's not heavy because subprocess
 | |
| 			// be in PTRACE_TRACEME mode before performing execve.
 | |
| 			Ptrace:    true,
 | |
| 			Pdeathsig: syscall.SIGKILL,
 | |
| 		},
 | |
| 	})
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("failed to start noop process for unshare: %w", err)
 | |
| 	}
 | |
| 
 | |
| 	defer func() {
 | |
| 		proc.Kill()
 | |
| 		proc.Wait()
 | |
| 	}()
 | |
| 
 | |
| 	// NOTE:
 | |
| 	//
 | |
| 	// The usernsFD will hold the userns reference in kernel. Even if the
 | |
| 	// child process is reaped, the usernsFD is still valid.
 | |
| 	usernsFD, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("failed to get userns file descriptor for /proc/%d/user/ns: %w", proc.Pid, err)
 | |
| 	}
 | |
| 	defer func() {
 | |
| 		if retErr != nil {
 | |
| 			usernsFD.Close()
 | |
| 		}
 | |
| 	}()
 | |
| 
 | |
| 	// Ensure the child process is still alive. If the err is ESRCH, we
 | |
| 	// should return error because we can't guarantee the usernsFD and
 | |
| 	// u[g]idmapFile are valid. It's safe to return error and retry.
 | |
| 	if err := proc.Signal(syscall.Signal(0)); err != nil {
 | |
| 		return nil, fmt.Errorf("failed to ensure child process is alive: %w", err)
 | |
| 	}
 | |
| 	return usernsFD, nil
 | |
| }
 |