cri: implement RuntimeConfig rpc

The rpc only reports one field, i.e. the cgroup driver, to kubelet. Containerd determines the effective cgroup driver by looking at all runtime handlers, starting from the default runtime handler (the rest in alphabetical order), and returning the cgroup driver setting of the first runtime handler that supports one. If no runtime handler supports cgroup driver (i.e. has a config option for it) containerd falls back to auto-detection, returning systemd if systemd is running and cgroupfs otherwise. This patch implements the CRI server side of Kubernetes KEP-4033: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/4033-group-driver-detection-over-cri Signed-off-by: Markus Lehtonen <markus.lehtonen@intel.com>
2023-06-19 16:14:22 +03:00
parent 850b2e1bf3
commit ed47d6ba76
80 changed files with 9669 additions and 0 deletions
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
@@ -0,0 +1,208 @@
+// Package devicefilter contains the eBPF device filter program.
+//
+// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
+//
+// Although ebpf.c was originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
+// agreed to relicense the file under the Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
+package devicefilter
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"strconv"
+
+	"github.com/cilium/ebpf/asm"
+	devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
+	"github.com/opencontainers/runc/libcontainer/devices"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	// license is the license string that DeviceFilter returns alongside the
+	// generated program.
+	// license string format is same as kernel MODULE_LICENSE macro
+	license = "Apache"
+)
+
+// DeviceFilter compiles the given device rules into an eBPF device filter
+// program and returns it together with the program's license string.
+//
+// The rules are first replayed through the devices emulator and the
+// emulator's resulting ruleset is what gets compiled. Minimal transitions do
+// not matter for cgroupv2, but going through the emulator guarantees that
+// device filtering behaves the same as on cgroupv1, including the security
+// hardenings against misconfiguration (such as punching holes in wildcard
+// rules).
+func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
+	emu := new(devicesemulator.Emulator)
+	for _, r := range rules {
+		if err := emu.Apply(*r); err != nil {
+			return nil, "", err
+		}
+	}
+	minimal, err := emu.Rules()
+	if err != nil {
+		return nil, "", err
+	}
+
+	prog := &program{defaultAllow: emu.IsBlacklist()}
+	prog.init()
+
+	for i, r := range minimal {
+		switch {
+		case r.Type == devices.WildcardDevice:
+			// Wildcard entries generate no code: there should be at most one,
+			// at the very start, which only tells cgroupv1 to go into
+			// allow-list mode. Double-check that expectation before skipping.
+			if i != 0 || r.Allow != emu.IsBlacklist() {
+				return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", i, r.CgroupString())
+			}
+		case r.Allow == prog.defaultAllow:
+			// The emulator removes rules whose action equals the default
+			// action, so seeing one here is an internal error.
+			return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", i, r.CgroupString())
+		default:
+			if err := prog.appendRule(r); err != nil {
+				return nil, "", err
+			}
+		}
+	}
+	return prog.finalize(), license, nil
+}
+
+// program is an eBPF device filter under construction.
+//
+// insts holds the instructions emitted so far. defaultAllow is the verdict
+// emitted by finalize for accesses that match none of the appended rules.
+// blockID is the numeric suffix of the "block-N" symbol for the next block to
+// be emitted; finalize sets it to -1, which appendRule treats as "the program
+// is finalized".
+type program struct {
+	insts        asm.Instructions
+	defaultAllow bool
+	blockID      int
+}
+
+// init emits the program prologue, which unpacks the fields of the context
+// structure referenced by R1 into R2..R5 for the rule blocks that follow.
+func (p *program) init() {
+	// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
+	/*
+		u32 access_type
+		u32 major
+		u32 minor
+	*/
+	p.insts = append(p.insts,
+		// R2 <- type (lower 16 bit of u32 access_type at R1[0])
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
+		asm.And.Imm32(asm.R2, 0xFFFF),
+
+		// R3 <- access (upper 16 bit of u32 access_type at R1[0]);
+		// RSh is a bitwise shift right.
+		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
+		asm.RSh.Imm32(asm.R3, 16),
+
+		// R4 <- major (u32 major at R1[4])
+		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word),
+
+		// R5 <- minor (u32 minor at R1[8])
+		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word),
+	)
+}
+
+// appendRule converts an OCI rule to the relevant eBPF block and adds it
+// to the in-progress filter program. In order to operate properly, it must be
+// called with a "clean" rule list (generated by devices.Emulator.Rules() --
+// with any "a" rules removed).
+//
+// The emitted block compares the device type, the requested access bits and
+// (when specified) the major and minor numbers against the rule, jumping to
+// the next block on the first mismatch; if everything matches it returns the
+// rule's verdict.
+//
+// An error is returned if the program has already been finalized, or if the
+// rule has an unsupported device type, an out-of-range major/minor number or
+// an unknown permission character.
+func (p *program) appendRule(rule *devices.Rule) error {
+	if p.blockID < 0 {
+		return errors.New("the program is finalized")
+	}
+
+	var bpfType int32
+	switch rule.Type {
+	case devices.CharDevice:
+		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
+	case devices.BlockDevice:
+		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
+	default:
+		// We do not permit 'a', nor any other types we don't know about.
+		return fmt.Errorf("invalid type %q", string(rule.Type))
+	}
+	if rule.Major > math.MaxUint32 {
+		return fmt.Errorf("invalid major %d", rule.Major)
+	}
+	if rule.Minor > math.MaxUint32 {
+		// Report the offending minor number (not the major one).
+		return fmt.Errorf("invalid minor %d", rule.Minor)
+	}
+	hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
+	hasMinor := rule.Minor >= 0
+	bpfAccess := int32(0)
+	for _, r := range rule.Permissions {
+		switch r {
+		case 'r':
+			bpfAccess |= unix.BPF_DEVCG_ACC_READ
+		case 'w':
+			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
+		case 'm':
+			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
+		default:
+			return fmt.Errorf("unknown device access %v", r)
+		}
+	}
+	// If the access is rwm, skip the check.
+	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
+
+	var (
+		blockSym         = "block-" + strconv.Itoa(p.blockID)
+		nextBlockSym     = "block-" + strconv.Itoa(p.blockID+1)
+		prevBlockLastIdx = len(p.insts) - 1
+	)
+	p.insts = append(p.insts,
+		// if (R2 != bpfType) goto next
+		asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
+	)
+	if hasAccess {
+		// The rule only matches when every requested access bit (R3) is
+		// also present in the rule's permissions.
+		p.insts = append(p.insts,
+			// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
+			asm.Mov.Reg32(asm.R1, asm.R3),
+			asm.And.Imm32(asm.R1, bpfAccess),
+			asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
+		)
+	}
+	if hasMajor {
+		p.insts = append(p.insts,
+			// if (R4 != major) goto next
+			asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
+		)
+	}
+	if hasMinor {
+		p.insts = append(p.insts,
+			// if (R5 != minor) goto next
+			asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
+		)
+	}
+	p.insts = append(p.insts, acceptBlock(rule.Allow)...)
+	// set blockSym to the first instruction we added in this iteration
+	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
+	p.blockID++
+	return nil
+}
+
+func (p *program) finalize() asm.Instructions {
+	var v int32
+	if p.defaultAllow {
+		v = 1
+	}
+	blockSym := "block-" + strconv.Itoa(p.blockID)
+	p.insts = append(p.insts,
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
+		asm.Return(),
+	)
+	p.blockID = -1
+	return p.insts
+}
+
+func acceptBlock(accept bool) asm.Instructions {
+	var v int32
+	if accept {
+		v = 1
+	}
+	return []asm.Instruction{
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v),
+		asm.Return(),
+	}
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go
@@ -0,0 +1,253 @@
+package ebpf
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"runtime"
+	"sync"
+	"unsafe"
+
+	"github.com/cilium/ebpf"
+	"github.com/cilium/ebpf/asm"
+	"github.com/cilium/ebpf/link"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// nilCloser is a no-op closer that always returns nil.
+// LoadAttachCgroupDeviceFilter returns it on the error paths that occur
+// before a new program has been attached.
+func nilCloser() error {
+	return nil
+}
+
+// findAttachedCgroupDeviceFilters returns handles for the BPF_CGROUP_DEVICE
+// programs attached to the cgroup directory referred to by dirFd.
+//
+// The program IDs are fetched with BPF_PROG_QUERY; if the kernel reports
+// ENOSPC the query is retried (up to 10 times) with the program count the
+// kernel wrote back. Programs that cannot be opened because of a permission
+// error are skipped. Any other failure to open a program closes the handles
+// opened so far and returns an error.
+func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
+	// Query layout passed to the bpf(2) syscall for BPF_PROG_QUERY.
+	type bpfAttrQuery struct {
+		TargetFd    uint32
+		AttachType  uint32
+		QueryType   uint32
+		AttachFlags uint32
+		ProgIds     uint64 // __aligned_u64
+		ProgCnt     uint32
+	}
+
+	// Currently you can only have 64 eBPF programs attached to a cgroup.
+	size := 64
+	retries := 0
+	for retries < 10 {
+		progIds := make([]uint32, size)
+		query := bpfAttrQuery{
+			TargetFd:   uint32(dirFd),
+			AttachType: uint32(unix.BPF_CGROUP_DEVICE),
+			ProgIds:    uint64(uintptr(unsafe.Pointer(&progIds[0]))),
+			ProgCnt:    uint32(len(progIds)),
+		}
+
+		// Fetch the list of program ids.
+		_, _, errno := unix.Syscall(unix.SYS_BPF,
+			uintptr(unix.BPF_PROG_QUERY),
+			uintptr(unsafe.Pointer(&query)),
+			unsafe.Sizeof(query))
+		// The kernel writes the program count back into the query.
+		size = int(query.ProgCnt)
+		runtime.KeepAlive(query)
+		if errno != 0 {
+			// On ENOSPC we get the correct number of programs.
+			if errno == unix.ENOSPC {
+				retries++
+				continue
+			}
+			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
+		}
+
+		// Convert the ids to program handles.
+		progIds = progIds[:size]
+		programs := make([]*ebpf.Program, 0, len(progIds))
+		for _, progId := range progIds {
+			program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
+			if err != nil {
+				// We skip over programs that give us -EACCES or -EPERM. This
+				// is necessary because there may be BPF programs that have
+				// been attached (such as with --systemd-cgroup) which have an
+				// LSM label that blocks us from interacting with the program.
+				//
+				// Because additional BPF_CGROUP_DEVICE programs only can add
+				// restrictions, there's no real issue with just ignoring these
+				// programs (and stops runc from breaking on distributions with
+				// very strict SELinux policies).
+				if errors.Is(err, os.ErrPermission) {
+					logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
+					continue
+				}
+				// The caller gets no handles on this path, so release the
+				// ones opened so far instead of leaking them. Close errors
+				// are ignored: this is best-effort cleanup and the original
+				// failure is the error worth reporting.
+				for _, opened := range programs {
+					_ = opened.Close()
+				}
+				return nil, fmt.Errorf("cannot fetch program from id: %w", err)
+			}
+			programs = append(programs, program)
+		}
+		runtime.KeepAlive(progIds)
+		return programs, nil
+	}
+
+	return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
+}
+
+// Cached result of the BPF_F_REPLACE probe done by haveBpfProgReplace; the
+// sync.Once makes sure the probe body runs only once.
+var (
+	haveBpfProgReplaceBool bool
+	haveBpfProgReplaceOnce sync.Once
+)
+
+// haveBpfProgReplace reports whether attaching a program with the
+// BPF_F_REPLACE flag appears to be supported. The probe runs on the first
+// call only and its result is reused afterwards. If the probe cannot be set
+// up (loading the dummy program or opening /dev/null fails) the result stays
+// false.
+//
+// Loosely based on the BPF_F_REPLACE support check in
+// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
+//
+// TODO: move this logic to cilium/ebpf
+func haveBpfProgReplace() bool {
+	haveBpfProgReplaceOnce.Do(func() {
+		// Load a minimal dummy program (R0 <- 0; return) to attach with.
+		prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
+			Type:    ebpf.CGroupDevice,
+			License: "MIT",
+			Instructions: asm.Instructions{
+				asm.Mov.Imm(asm.R0, 0),
+				asm.Return(),
+			},
+		})
+		if err != nil {
+			logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
+			return
+		}
+		defer prog.Close()
+
+		// /dev/null is used as a deliberately invalid attach target.
+		devnull, err := os.Open("/dev/null")
+		if err != nil {
+			logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
+			return
+		}
+		defer devnull.Close()
+
+		// We know that we have BPF_PROG_ATTACH since we can load
+		// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
+		// we know that the feature isn't present.
+		err = link.RawAttachProgram(link.RawAttachProgramOptions{
+			// We rely on this fd being checked after attachFlags.
+			Target: int(devnull.Fd()),
+			// Attempt to "replace" bad fds with this program.
+			Program: prog,
+			Attach:  ebpf.AttachCGroupDevice,
+			Flags:   unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
+		})
+		if errors.Is(err, unix.EINVAL) {
+			// not supported
+			return
+		}
+		// attach_flags test succeeded.
+		// EBADF is the expected outcome here; anything else is only logged
+		// and the feature is still treated as present.
+		if !errors.Is(err, unix.EBADF) {
+			logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
+		}
+		haveBpfProgReplaceBool = true
+	})
+	return haveBpfProgReplaceBool
+}
+
+// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
+//
+// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
+//
+// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
+//
+// insts and license describe the program to load, and dirFd is the file
+// descriptor of the target cgroup directory. The new program is attached
+// first; any previously attached device filters are then either replaced in
+// place (exactly one old program and BPF_F_REPLACE is supported) or detached
+// one by one.
+//
+// The returned function detaches the newly attached program from dirFd. If
+// an error occurs before the new program is attached, a no-op function is
+// returned together with the error. If detaching an old filter fails, the
+// error is returned together with the real detach function, because the new
+// program is already attached at that point.
+func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
+	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
+	// This limit is not inherited into the container.
+	memlockLimit := &unix.Rlimit{
+		Cur: unix.RLIM_INFINITY,
+		Max: unix.RLIM_INFINITY,
+	}
+	// Best effort: a failure to raise the limit is deliberately ignored.
+	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
+
+	// Get the list of existing programs.
+	// NOTE(review): the handles in oldProgs are never explicitly closed in
+	// this function -- confirm whether that is intentional.
+	oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
+	if err != nil {
+		return nilCloser, err
+	}
+	useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
+
+	// Generate new program.
+	spec := &ebpf.ProgramSpec{
+		Type:         ebpf.CGroupDevice,
+		Instructions: insts,
+		License:      license,
+	}
+	prog, err := ebpf.NewProgram(spec)
+	if err != nil {
+		return nilCloser, err
+	}
+
+	// If there is only one old program, we can just replace it directly.
+	var (
+		replaceProg *ebpf.Program
+		attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
+	)
+	if useReplaceProg {
+		replaceProg = oldProgs[0]
+		attachFlags |= unix.BPF_F_REPLACE
+	}
+	err = link.RawAttachProgram(link.RawAttachProgramOptions{
+		Target:  dirFd,
+		Program: prog,
+		Replace: replaceProg,
+		Attach:  ebpf.AttachCGroupDevice,
+		Flags:   attachFlags,
+	})
+	if err != nil {
+		// NOTE(review): prog is not closed on this path -- confirm whether
+		// it should be.
+		return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
+	}
+	// closer detaches the newly attached program from the cgroup. Note that
+	// it assigns to the enclosing function's err variable rather than
+	// declaring its own.
+	closer := func() error {
+		err = link.RawDetachProgram(link.RawDetachProgramOptions{
+			Target:  dirFd,
+			Program: prog,
+			Attach:  ebpf.AttachCGroupDevice,
+		})
+		if err != nil {
+			return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
+		}
+		// TODO: Should we attach the old filters back in this case? Otherwise
+		//       we fail-open on a security feature, which is a bit scary.
+		return nil
+	}
+	// Without an in-place replace, the old filters are still attached next
+	// to the new one and have to be detached explicitly.
+	if !useReplaceProg {
+		logLevel := logrus.DebugLevel
+		// If there was more than one old program, give a warning (since this
+		// really shouldn't happen with runc-managed cgroups) and then detach
+		// all the old programs.
+		if len(oldProgs) > 1 {
+			// NOTE: Ideally this should be a warning but it turns out that
+			//       systemd-managed cgroups trigger this warning (apparently
+			//       systemd doesn't delete old non-systemd programs when
+			//       setting properties).
+			logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
+			logLevel = logrus.InfoLevel
+		}
+		for idx, oldProg := range oldProgs {
+			// Output some extra debug info.
+			// If the program info cannot be read, nothing is logged for this
+			// filter; it is still detached below.
+			if info, err := oldProg.Info(); err == nil {
+				fields := logrus.Fields{
+					"type": info.Type.String(),
+					"tag":  info.Tag,
+					"name": info.Name,
+				}
+				if id, ok := info.ID(); ok {
+					fields["id"] = id
+				}
+				if runCount, ok := info.RunCount(); ok {
+					fields["run_count"] = runCount
+				}
+				// This local named runtime shadows the runtime package
+				// inside the if statement.
+				if runtime, ok := info.Runtime(); ok {
+					fields["runtime"] = runtime.String()
+				}
+				logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
+			}
+			err = link.RawDetachProgram(link.RawDetachProgramOptions{
+				Target:  dirFd,
+				Program: oldProg,
+				Attach:  ebpf.AttachCGroupDevice,
+			})
+			if err != nil {
+				// The new program is already attached, so the real closer is
+				// returned with the error.
+				return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
+			}
+		}
+	}
+	return closer, nil
+}