Move sys to pkg/sys

Signed-off-by: Derek McGowan <derek@mcg.dev>
This commit is contained in:
Derek McGowan
2024-01-17 09:56:16 -08:00
parent de606680b0
commit 6be90158cd
22 changed files with 10 additions and 10 deletions

View File

@@ -0,0 +1,51 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"os"
"github.com/moby/sys/sequential"
)
// CreateSequential is deprecated.
//
// Deprecated: use github.com/moby/sys/sequential.Create
func CreateSequential(name string) (*os.File, error) {
return sequential.Create(name)
}
// OpenSequential is deprecated.
//
// Deprecated: use github.com/moby/sys/sequential.Open
func OpenSequential(name string) (*os.File, error) {
return sequential.Open(name)
}
// OpenFileSequential is deprecated.
//
// Deprecated: use github.com/moby/sys/sequential.OpenFile
func OpenFileSequential(name string, flag int, perm os.FileMode) (*os.File, error) {
return sequential.OpenFile(name, flag, perm)
}
// TempFileSequential is deprecated.
//
// Deprecated: use github.com/moby/sys/sequential.CreateTemp
func TempFileSequential(dir, prefix string) (f *os.File, err error) {
return sequential.CreateTemp(dir, prefix)
}

26
pkg/sys/filesys_unix.go Normal file
View File

@@ -0,0 +1,26 @@
//go:build !windows
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import "os"
// MkdirAllWithACL is a wrapper for os.MkdirAll on Unix systems.
func MkdirAllWithACL(path string, perm os.FileMode) error {
return os.MkdirAll(path, perm)
}

151
pkg/sys/filesys_windows.go Normal file
View File

@@ -0,0 +1,151 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"os"
"regexp"
"syscall"
"unsafe"
"golang.org/x/sys/windows"
)
// SddlAdministratorsLocalSystem is local administrators plus NT AUTHORITY\System.
const SddlAdministratorsLocalSystem = "D:P(A;OICI;GA;;;BA)(A;OICI;GA;;;SY)"
// volumePath is a regular expression to check if a path is a Windows
// volume path (e.g., "\\?\Volume{4c1b02c1-d990-11dc-99ae-806e6f6e6963}"
// or "\\?\Volume{4c1b02c1-d990-11dc-99ae-806e6f6e6963}\").
var volumePath = regexp.MustCompile(`^\\\\\?\\Volume{[a-z0-9-]+}\\?$`)
// MkdirAllWithACL is a custom version of os.MkdirAll modified for use on Windows
// so that it is both volume path aware, and to create a directory
// an appropriate SDDL defined ACL for Builtin Administrators and Local System.
func MkdirAllWithACL(path string, _ os.FileMode) error {
sa, err := makeSecurityAttributes(SddlAdministratorsLocalSystem)
if err != nil {
return &os.PathError{Op: "mkdirall", Path: path, Err: err}
}
return mkdirall(path, sa)
}
// MkdirAll is a custom version of os.MkdirAll that is volume path aware for
// Windows. It can be used as a drop-in replacement for os.MkdirAll.
func MkdirAll(path string, _ os.FileMode) error {
return mkdirall(path, nil)
}
// mkdirall is a custom version of os.MkdirAll modified for use on Windows
// so that it is both volume path aware, and can create a directory with
// a DACL.
func mkdirall(path string, perm *windows.SecurityAttributes) error {
if volumePath.MatchString(path) {
return nil
}
// The rest of this method is largely copied from os.MkdirAll and should be kept
// as-is to ensure compatibility.
// Fast path: if we can tell whether path is a directory or file, stop with success or error.
dir, err := os.Stat(path)
if err == nil {
if dir.IsDir() {
return nil
}
return &os.PathError{Op: "mkdir", Path: path, Err: syscall.ENOTDIR}
}
// Slow path: make sure parent exists and then call Mkdir for path.
i := len(path)
for i > 0 && os.IsPathSeparator(path[i-1]) { // Skip trailing path separator.
i--
}
j := i
for j > 0 && !os.IsPathSeparator(path[j-1]) { // Scan backward over element.
j--
}
if j > 1 {
// Create parent.
err = mkdirall(fixRootDirectory(path[:j-1]), perm)
if err != nil {
return err
}
}
// Parent now exists; invoke Mkdir and use its result.
err = mkdirWithACL(path, perm)
if err != nil {
// Handle arguments like "foo/." by
// double-checking that directory doesn't exist.
dir, err1 := os.Lstat(path)
if err1 == nil && dir.IsDir() {
return nil
}
return err
}
return nil
}
// mkdirWithACL creates a new directory. If there is an error, it will be of
// type *PathError. .
//
// This is a modified and combined version of os.Mkdir and windows.Mkdir
// in golang to cater for creating a directory am ACL permitting full
// access, with inheritance, to any subfolder/file for Built-in Administrators
// and Local System.
func mkdirWithACL(name string, sa *windows.SecurityAttributes) error {
if sa == nil {
return os.Mkdir(name, 0)
}
namep, err := windows.UTF16PtrFromString(name)
if err != nil {
return &os.PathError{Op: "mkdir", Path: name, Err: err}
}
err = windows.CreateDirectory(namep, sa)
if err != nil {
return &os.PathError{Op: "mkdir", Path: name, Err: err}
}
return nil
}
// fixRootDirectory fixes a reference to a drive's root directory to
// have the required trailing slash.
func fixRootDirectory(p string) string {
if len(p) == len(`\\?\c:`) {
if os.IsPathSeparator(p[0]) && os.IsPathSeparator(p[1]) && p[2] == '?' && os.IsPathSeparator(p[3]) && p[5] == ':' {
return p + `\`
}
}
return p
}
func makeSecurityAttributes(sddl string) (*windows.SecurityAttributes, error) {
var sa windows.SecurityAttributes
sa.Length = uint32(unsafe.Sizeof(sa))
sa.InheritHandle = 1
var err error
sa.SecurityDescriptor, err = windows.SecurityDescriptorFromString(sddl)
if err != nil {
return nil, err
}
return &sa, nil
}

82
pkg/sys/oom_linux.go Normal file
View File

@@ -0,0 +1,82 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"fmt"
"os"
"strconv"
"strings"
"github.com/containerd/containerd/v2/pkg/userns"
"golang.org/x/sys/unix"
)
const (
// OOMScoreAdjMin is from OOM_SCORE_ADJ_MIN https://github.com/torvalds/linux/blob/v5.10/include/uapi/linux/oom.h#L9
OOMScoreAdjMin = -1000
// OOMScoreAdjMax is from OOM_SCORE_ADJ_MAX https://github.com/torvalds/linux/blob/v5.10/include/uapi/linux/oom.h#L10
OOMScoreAdjMax = 1000
)
// AdjustOOMScore sets the oom score for the provided pid. If the provided score
// is out of range (-1000 - 1000), it is clipped to the min/max value.
func AdjustOOMScore(pid, score int) error {
if score > OOMScoreAdjMax {
score = OOMScoreAdjMax
} else if score < OOMScoreAdjMin {
score = OOMScoreAdjMin
}
return SetOOMScore(pid, score)
}
// SetOOMScore sets the oom score for the provided pid
func SetOOMScore(pid, score int) error {
if score > OOMScoreAdjMax || score < OOMScoreAdjMin {
return fmt.Errorf("value out of range (%d): OOM score must be between %d and %d", score, OOMScoreAdjMin, OOMScoreAdjMax)
}
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
f, err := os.OpenFile(path, os.O_WRONLY, 0)
if err != nil {
return err
}
defer f.Close()
if _, err = f.WriteString(strconv.Itoa(score)); err != nil {
if os.IsPermission(err) && (!runningPrivileged() || userns.RunningInUserNS()) {
return nil
}
return err
}
return nil
}
// GetOOMScoreAdj gets the oom score for a process. It returns 0 (zero) if either
// no oom score is set, or a sore is set to 0.
func GetOOMScoreAdj(pid int) (int, error) {
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
data, err := os.ReadFile(path)
if err != nil {
return 0, err
}
return strconv.Atoi(strings.TrimSpace(string(data)))
}
// runningPrivileged returns true if the effective user ID of the
// calling process is 0
func runningPrivileged() bool {
return unix.Geteuid() == 0
}

129
pkg/sys/oom_linux_test.go Normal file
View File

@@ -0,0 +1,129 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"errors"
"fmt"
"os"
"os/exec"
"testing"
"time"
"github.com/containerd/containerd/v2/pkg/userns"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestSetPositiveOomScoreAdjustment(t *testing.T) {
// Setting a *positive* OOM score adjust does not require privileged
_, adjustment, err := adjustOom(123)
assert.NoError(t, err)
assert.EqualValues(t, adjustment, 123)
}
func TestSetNegativeOomScoreAdjustmentWhenPrivileged(t *testing.T) {
if !runningPrivileged() || userns.RunningInUserNS() {
t.Skip("requires root and not running in user namespace")
return
}
_, adjustment, err := adjustOom(-123)
assert.NoError(t, err)
assert.EqualValues(t, adjustment, -123)
}
func TestSetNegativeOomScoreAdjustmentWhenUnprivilegedHasNoEffect(t *testing.T) {
if runningPrivileged() && !userns.RunningInUserNS() {
t.Skip("needs to be run as non-root or in user namespace")
return
}
initial, adjustment, err := adjustOom(-123)
assert.NoError(t, err)
assert.EqualValues(t, adjustment, initial)
}
func TestSetOOMScoreBoundaries(t *testing.T) {
err := SetOOMScore(0, OOMScoreAdjMax+1)
require.NotNil(t, err)
assert.Contains(t, err.Error(), fmt.Sprintf("value out of range (%d): OOM score must be between", OOMScoreAdjMax+1))
err = SetOOMScore(0, OOMScoreAdjMin-1)
require.NotNil(t, err)
assert.Contains(t, err.Error(), fmt.Sprintf("value out of range (%d): OOM score must be between", OOMScoreAdjMin-1))
_, adjustment, err := adjustOom(OOMScoreAdjMax)
assert.NoError(t, err)
assert.EqualValues(t, adjustment, OOMScoreAdjMax)
score, err := GetOOMScoreAdj(os.Getpid())
assert.NoError(t, err)
if score == OOMScoreAdjMin {
// We won't be able to set the score lower than the parent process. This
// could also be tested if the parent process does not have a oom-score-adj
// set, but GetOOMScoreAdj does not distinguish between "not set" and
// "score is set, but zero".
_, adjustment, err = adjustOom(OOMScoreAdjMin)
assert.NoError(t, err)
assert.EqualValues(t, adjustment, OOMScoreAdjMin)
}
}
func adjustOom(adjustment int) (int, int, error) {
cmd := exec.Command("sleep", "100")
if err := cmd.Start(); err != nil {
return 0, 0, err
}
defer cmd.Process.Kill()
pid, err := waitForPid(cmd.Process)
if err != nil {
return 0, 0, err
}
initial, err := GetOOMScoreAdj(pid)
if err != nil {
return 0, 0, err
}
if err := SetOOMScore(pid, adjustment); err != nil {
return 0, 0, err
}
adj, err := GetOOMScoreAdj(pid)
return initial, adj, err
}
func waitForPid(process *os.Process) (int, error) {
c := make(chan int, 1)
go func() {
for {
pid := process.Pid
if pid != 0 {
c <- pid
}
}
}()
select {
case pid := <-c:
return pid, nil
case <-time.After(10 * time.Second):
return 0, errors.New("process did not start in 10 seconds")
}
}

View File

@@ -0,0 +1,48 @@
//go:build !linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
const (
// OOMScoreMaxKillable is not implemented on non Linux
OOMScoreMaxKillable = 0
// OOMScoreAdjMax is not implemented on non Linux
OOMScoreAdjMax = 0
)
// AdjustOOMScore sets the oom score for the provided pid. If the provided score
// is out of range (-1000 - 1000), it is clipped to the min/max value.
//
// Not implemented on Windows
func AdjustOOMScore(pid, score int) error {
return nil
}
// SetOOMScore sets the oom score for the process
//
// Not implemented on Windows
func SetOOMScore(pid, score int) error {
return nil
}
// GetOOMScoreAdj gets the oom score for a process
//
// Not implemented on Windows
func GetOOMScoreAdj(pid int) (int, error) {
return 0, nil
}

View File

@@ -0,0 +1,290 @@
//go:build !windows
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package reaper
import (
"errors"
"fmt"
"os/exec"
"runtime"
"sync"
"syscall"
"time"
runc "github.com/containerd/go-runc"
"golang.org/x/sys/unix"
)
// ErrNoSuchProcess is returned when the process no longer exists
var ErrNoSuchProcess = errors.New("no such process")
const bufferSize = 32
type subscriber struct {
sync.Mutex
c chan runc.Exit
closed bool
}
func (s *subscriber) close() {
s.Lock()
if s.closed {
s.Unlock()
return
}
close(s.c)
s.closed = true
s.Unlock()
}
func (s *subscriber) do(fn func()) {
s.Lock()
fn()
s.Unlock()
}
// Reap should be called when the process receives an SIGCHLD. Reap will reap
// all exited processes and close their wait channels
func Reap() error {
now := time.Now()
exits, err := reap(false)
for _, e := range exits {
done := Default.notify(runc.Exit{
Timestamp: now,
Pid: e.Pid,
Status: e.Status,
})
select {
case <-done:
case <-time.After(1 * time.Second):
}
}
return err
}
// Default is the default monitor initialized for the package
var Default = &Monitor{
subscribers: make(map[chan runc.Exit]*subscriber),
}
// Monitor monitors the underlying system for process status changes
type Monitor struct {
sync.Mutex
subscribers map[chan runc.Exit]*subscriber
}
// Start starts the command and registers the process with the reaper
func (m *Monitor) Start(c *exec.Cmd) (chan runc.Exit, error) {
ec := m.Subscribe()
if err := c.Start(); err != nil {
m.Unsubscribe(ec)
return nil, err
}
return ec, nil
}
// StartLocked starts the command and registers the process with the reaper
func (m *Monitor) StartLocked(c *exec.Cmd) (chan runc.Exit, error) {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
return m.Start(c)
}
// Wait blocks until a process is signal as dead.
// User should rely on the value of the exit status to determine if the
// command was successful or not.
func (m *Monitor) Wait(c *exec.Cmd, ec chan runc.Exit) (int, error) {
for e := range ec {
if e.Pid == c.Process.Pid {
// make sure we flush all IO
c.Wait()
m.Unsubscribe(ec)
return e.Status, nil
}
}
// return no such process if the ec channel is closed and no more exit
// events will be sent
return -1, ErrNoSuchProcess
}
// WaitTimeout is used to skip the blocked command and kill the left process.
func (m *Monitor) WaitTimeout(c *exec.Cmd, ec chan runc.Exit, timeout time.Duration) (int, error) {
type exitStatusWrapper struct {
status int
err error
}
// capacity can make sure that the following goroutine will not be
// blocked if there is no receiver when timeout.
waitCh := make(chan *exitStatusWrapper, 1)
go func() {
defer close(waitCh)
status, err := m.Wait(c, ec)
waitCh <- &exitStatusWrapper{
status: status,
err: err,
}
}()
timer := time.NewTimer(timeout)
defer timer.Stop()
select {
case <-timer.C:
syscall.Kill(c.Process.Pid, syscall.SIGKILL)
return 0, fmt.Errorf("timeout %v for cmd(pid=%d): %s, %s", timeout, c.Process.Pid, c.Path, c.Args)
case res := <-waitCh:
return res.status, res.err
}
}
// Subscribe to process exit changes
func (m *Monitor) Subscribe() chan runc.Exit {
c := make(chan runc.Exit, bufferSize)
m.Lock()
m.subscribers[c] = &subscriber{
c: c,
}
m.Unlock()
return c
}
// Unsubscribe to process exit changes
func (m *Monitor) Unsubscribe(c chan runc.Exit) {
m.Lock()
s, ok := m.subscribers[c]
if !ok {
m.Unlock()
return
}
s.close()
delete(m.subscribers, c)
m.Unlock()
}
func (m *Monitor) getSubscribers() map[chan runc.Exit]*subscriber {
out := make(map[chan runc.Exit]*subscriber)
m.Lock()
for k, v := range m.subscribers {
out[k] = v
}
m.Unlock()
return out
}
func (m *Monitor) notify(e runc.Exit) chan struct{} {
const timeout = 1 * time.Millisecond
var (
done = make(chan struct{}, 1)
timer = time.NewTimer(timeout)
success = make(map[chan runc.Exit]struct{})
)
stop(timer, true)
go func() {
defer close(done)
for {
var (
failed int
subscribers = m.getSubscribers()
)
for _, s := range subscribers {
s.do(func() {
if s.closed {
return
}
if _, ok := success[s.c]; ok {
return
}
timer.Reset(timeout)
recv := true
select {
case s.c <- e:
success[s.c] = struct{}{}
case <-timer.C:
recv = false
failed++
}
stop(timer, recv)
})
}
// all subscribers received the message
if failed == 0 {
return
}
}
}()
return done
}
func stop(timer *time.Timer, recv bool) {
if !timer.Stop() && recv {
<-timer.C
}
}
// exit is the wait4 information from an exited process
type exit struct {
Pid int
Status int
}
// reap reaps all child processes for the calling process and returns their
// exit information
func reap(wait bool) (exits []exit, err error) {
var (
ws unix.WaitStatus
rus unix.Rusage
)
flag := unix.WNOHANG
if wait {
flag = 0
}
for {
pid, err := unix.Wait4(-1, &ws, flag, &rus)
if err != nil {
if err == unix.ECHILD {
return exits, nil
}
return exits, err
}
if pid <= 0 {
return exits, nil
}
exits = append(exits, exit{
Pid: pid,
Status: exitStatus(ws),
})
}
}
const exitSignalOffset = 128
// exitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly
func exitStatus(status unix.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
}

View File

@@ -0,0 +1,39 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package reaper
import (
"unsafe"
"golang.org/x/sys/unix"
)
// SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error {
return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
}
// GetSubreaper returns the subreaper setting for the calling process
func GetSubreaper() (int, error) {
var i uintptr
if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
return -1, err
}
return int(i), nil
}

80
pkg/sys/socket_unix.go Normal file
View File

@@ -0,0 +1,80 @@
//go:build !windows
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"fmt"
"net"
"os"
"path/filepath"
"golang.org/x/sys/unix"
)
// CreateUnixSocket creates a unix socket and returns the listener
func CreateUnixSocket(path string) (net.Listener, error) {
// BSDs have a 104 limit
if len(path) > 104 {
return nil, fmt.Errorf("%q: unix socket path too long (> 104)", path)
}
if err := os.MkdirAll(filepath.Dir(path), 0660); err != nil {
return nil, err
}
if err := unix.Unlink(path); err != nil && !os.IsNotExist(err) {
return nil, err
}
return net.Listen("unix", path)
}
// GetLocalListener returns a listener out of a unix socket.
func GetLocalListener(path string, uid, gid int) (net.Listener, error) {
// Ensure parent directory is created
if err := mkdirAs(filepath.Dir(path), uid, gid); err != nil {
return nil, err
}
l, err := CreateUnixSocket(path)
if err != nil {
return l, err
}
if err := os.Chmod(path, 0660); err != nil {
l.Close()
return nil, err
}
if err := os.Chown(path, uid, gid); err != nil {
l.Close()
return nil, err
}
return l, nil
}
func mkdirAs(path string, uid, gid int) error {
if _, err := os.Stat(path); !os.IsNotExist(err) {
return err
}
if err := os.MkdirAll(path, 0770); err != nil {
return err
}
return os.Chown(path, uid, gid)
}

30
pkg/sys/socket_windows.go Normal file
View File

@@ -0,0 +1,30 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"net"
"github.com/Microsoft/go-winio"
)
// GetLocalListener returns a Listernet out of a named pipe.
// `path` must be of the form of `\\.\pipe\<pipename>`
// (see https://msdn.microsoft.com/en-us/library/windows/desktop/aa365150)
func GetLocalListener(path string, uid, gid int) (net.Listener, error) {
return winio.ListenPipe(path, nil)
}

View File

@@ -0,0 +1,30 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
_ "unsafe" // required for go:linkname.
)
//go:linkname beforeFork syscall.runtime_BeforeFork
func beforeFork()
//go:linkname afterFork syscall.runtime_AfterFork
func afterFork()
//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
func afterForkInChild()

View File

@@ -0,0 +1,94 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sys
import (
"runtime"
"syscall"
"unsafe"
)
// ForkUserns is to fork child process with user namespace. It returns child
// process's pid and pidfd reference to the child process.
//
// Precondition: The runtime OS thread must be locked, which is GO runtime
// requirement.
//
// Beside this, the child process sets PR_SET_PDEATHSIG with SIGKILL so that
// the parent process's OS thread must be locked. Otherwise, the exit event of
// parent process's OS thread will send kill signal to the child process,
// even if parent process is still running.
//
//go:norace
//go:noinline
func ForkUserns() (_pid uintptr, _pidfd uintptr, _ syscall.Errno) {
var (
pidfd uintptr
pid, ppid uintptr
err syscall.Errno
)
ppid, _, err = syscall.RawSyscall(uintptr(syscall.SYS_GETPID), 0, 0, 0)
if err != 0 {
return 0, 0, err
}
beforeFork()
if runtime.GOARCH == "s390x" {
// NOTE:
//
// On the s390 architectures, the order of the first two
// arguments is reversed.
//
// REF: https://man7.org/linux/man-pages/man2/clone.2.html
pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
0,
uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
uintptr(unsafe.Pointer(&pidfd)),
)
} else {
pid, _, err = syscall.RawSyscall(syscall.SYS_CLONE,
uintptr(syscall.CLONE_NEWUSER|syscall.SIGCHLD|syscall.CLONE_PIDFD),
0,
uintptr(unsafe.Pointer(&pidfd)),
)
}
if err != 0 || pid != 0 {
afterFork()
return pid, pidfd, err
}
afterForkInChild()
if _, _, err = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 {
goto err
}
pid, _, err = syscall.RawSyscall(syscall.SYS_GETPPID, 0, 0, 0)
if err != 0 {
goto err
}
// exit if re-parent
if pid != ppid {
goto err
}
_, _, err = syscall.RawSyscall(syscall.SYS_PPOLL, 0, 0, 0)
err:
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0)
panic("unreachable")
}