//go:build !windows /* Copyright The containerd Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package shim import ( "bufio" "context" "crypto/sha256" "errors" "fmt" "io" "math" "net" "os" "path/filepath" "runtime" "strconv" "strings" "syscall" "time" "github.com/containerd/log" "github.com/mdlayher/vsock" "github.com/containerd/containerd/v2/defaults" "github.com/containerd/containerd/v2/pkg/namespaces" "github.com/containerd/containerd/v2/pkg/sys" ) const ( shimBinaryFormat = "containerd-shim-%s-%s" socketPathLimit = 106 protoVsock = "vsock" protoHybridVsock = "hvsock" protoUnix = "unix" ) func getSysProcAttr() *syscall.SysProcAttr { return &syscall.SysProcAttr{ Setpgid: true, } } // AdjustOOMScore sets the OOM score for the process to the parents OOM score +1 // to ensure that they parent has a lower* score than the shim // if not already at the maximum OOM Score func AdjustOOMScore(pid int) error { parent := os.Getppid() score, err := sys.GetOOMScoreAdj(parent) if err != nil { return fmt.Errorf("get parent OOM score: %w", err) } shimScore := score + 1 if err := sys.AdjustOOMScore(pid, shimScore); err != nil { return fmt.Errorf("set shim OOM score: %w", err) } return nil } const socketRoot = defaults.DefaultStateDir // SocketAddress returns a socket address func SocketAddress(ctx context.Context, socketPath, id string, debug bool) (string, error) { ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return "", err } path := filepath.Join(socketPath, ns, id) if debug { path = filepath.Join(path, "debug") } d := sha256.Sum256([]byte(path)) return fmt.Sprintf("unix://%s/%x", filepath.Join(socketRoot, "s"), d), nil } // AnonDialer returns a dialer for a socket func AnonDialer(address string, timeout time.Duration) (net.Conn, error) { proto, addr, ok := strings.Cut(address, "://") if !ok { return net.DialTimeout("unix", socket(address).path(), timeout) } switch proto { case protoVsock: // vsock dialer can not set timeout return dialVsock(addr) case protoHybridVsock: return dialHybridVsock(addr, timeout) case protoUnix: return net.DialTimeout("unix", socket(address).path(), timeout) default: return nil, fmt.Errorf("unsupported protocol: %s", proto) } } // AnonReconnectDialer returns a dialer for an existing socket on reconnection func AnonReconnectDialer(address string, timeout time.Duration) (net.Conn, error) { return AnonDialer(address, timeout) } // NewSocket returns a new socket func NewSocket(address string) (*net.UnixListener, error) { var ( sock = socket(address) path = sock.path() isAbstract = sock.isAbstract() perm = os.FileMode(0600) ) // Darwin needs +x to access socket, otherwise it'll fail with "bind: permission denied" when running as non-root. if runtime.GOOS == "darwin" { perm = 0700 } if !isAbstract { if err := os.MkdirAll(filepath.Dir(path), perm); err != nil { return nil, fmt.Errorf("mkdir failed for %s: %w", path, err) } } l, err := net.Listen("unix", path) if err != nil { return nil, err } if !isAbstract { if err := os.Chmod(path, perm); err != nil { os.Remove(sock.path()) l.Close() return nil, fmt.Errorf("chmod failed for %s: %w", path, err) } } return l.(*net.UnixListener), nil } const abstractSocketPrefix = "\x00" type socket string func (s socket) isAbstract() bool { return !strings.HasPrefix(string(s), "unix://") } func (s socket) path() string { path := strings.TrimPrefix(string(s), "unix://") // if there was no trim performed, we assume an abstract socket if len(path) == len(s) { path = abstractSocketPrefix + path } return path } // RemoveSocket removes the socket at the specified address if // it exists on the filesystem func RemoveSocket(address string) error { sock := socket(address) if !sock.isAbstract() { return os.Remove(sock.path()) } return nil } // SocketEaddrinuse returns true if the provided error is caused by the // EADDRINUSE error number func SocketEaddrinuse(err error) bool { var netErr *net.OpError if errors.As(err, &netErr) { if netErr.Op != "listen" { return false } return errors.Is(err, syscall.EADDRINUSE) } return false } // CanConnect returns true if the socket provided at the address // is accepting new connections func CanConnect(address string) bool { conn, err := AnonDialer(address, 100*time.Millisecond) if err != nil { return false } conn.Close() return true } func hybridVsockDialer(addr string, port uint64, timeout time.Duration) (net.Conn, error) { timeoutCh := time.After(timeout) // Do 10 retries before timeout retryInterval := timeout / 10 for { conn, err := net.DialTimeout("unix", addr, timeout) if err != nil { return nil, err } if _, err = conn.Write([]byte(fmt.Sprintf("CONNECT %d\n", port))); err != nil { conn.Close() return nil, err } errChan := make(chan error, 1) go func() { reader := bufio.NewReader(conn) response, err := reader.ReadString('\n') if err != nil { errChan <- err return } if strings.Contains(response, "OK") { errChan <- nil } else { errChan <- fmt.Errorf("hybrid vsock handshake response error: %s", response) } }() select { case err = <-errChan: if err != nil { conn.Close() // When it is EOF, maybe the server side is not ready. if err == io.EOF { log.G(context.Background()).Warnf("Read hybrid vsock got EOF, server may not ready") time.Sleep(retryInterval) continue } return nil, err } return conn, nil case <-timeoutCh: conn.Close() return nil, fmt.Errorf("timeout waiting for hybrid vsocket handshake of %s:%d", addr, port) } } } func dialVsock(address string) (net.Conn, error) { contextIDString, portString, ok := strings.Cut(address, ":") if !ok { return nil, fmt.Errorf("invalid vsock address %s", address) } contextID, err := strconv.ParseUint(contextIDString, 10, 0) if err != nil { return nil, fmt.Errorf("failed to parse vsock context id %s, %v", contextIDString, err) } if contextID > math.MaxUint32 { return nil, fmt.Errorf("vsock context id %d is invalid", contextID) } port, err := strconv.ParseUint(portString, 10, 0) if err != nil { return nil, fmt.Errorf("failed to parse vsock port %s, %v", portString, err) } if port > math.MaxUint32 { return nil, fmt.Errorf("vsock port %d is invalid", port) } return vsock.Dial(uint32(contextID), uint32(port), &vsock.Config{}) } func dialHybridVsock(address string, timeout time.Duration) (net.Conn, error) { addr, portString, ok := strings.Cut(address, ":") if !ok { return nil, fmt.Errorf("invalid hybrid vsock address %s", address) } port, err := strconv.ParseUint(portString, 10, 0) if err != nil { return nil, fmt.Errorf("failed to parse hybrid vsock port %s, %v", portString, err) } if port > math.MaxUint32 { return nil, fmt.Errorf("hybrid vsock port %d is invalid", port) } return hybridVsockDialer(addr, port, timeout) } func cleanupSockets(ctx context.Context) { if address, err := ReadAddress("address"); err == nil { _ = RemoveSocket(address) } if len(socketFlag) > 0 { _ = RemoveSocket("unix://" + socketFlag) } else if address, err := SocketAddress(ctx, addressFlag, id, false); err == nil { _ = RemoveSocket(address) } if len(debugSocketFlag) > 0 { _ = RemoveSocket("unix://" + debugSocketFlag) } else if address, err := SocketAddress(ctx, addressFlag, id, true); err == nil { _ = RemoveSocket(address) } }