cri: implement RuntimeConfig rpc

The rpc only reports one field, i.e. the cgroup driver, to kubelet.
Containerd determines the effective cgroup driver by looking at all
runtime handlers, starting from the default runtime handler (the rest in
alphabetical order), and returning the cgroup driver setting of the
first runtime handler that supports one. If no runtime handler supports
cgroup driver (i.e. has a config option for it) containerd falls back to
auto-detection, returning systemd if systemd is running and cgroupfs
otherwise.

This patch implements the CRI server side of Kubernetes KEP-4033:
https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/4033-group-driver-detection-over-cri

Signed-off-by: Markus Lehtonen <markus.lehtonen@intel.com>
This commit is contained in:
Markus Lehtonen
2023-06-19 16:14:22 +03:00
parent 850b2e1bf3
commit ed47d6ba76
80 changed files with 9669 additions and 0 deletions

View File

@@ -0,0 +1,208 @@
// Package devicefilter contains eBPF device filter program
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
package devicefilter
import (
"errors"
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"golang.org/x/sys/unix"
)
const (
// license string format is same as kernel MODULE_LICENSE macro
license = "Apache"
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// Generate the minimum ruleset for the device rules we are given. While we
// don't care about minimum transitions in cgroupv2, using the emulator
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(devicesemulator.Emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err
}
}
cleanRules, err := emu.Rules()
if err != nil {
return nil, "", err
}
p := &program{
defaultAllow: emu.IsBlacklist(),
}
p.init()
for idx, rule := range cleanRules {
if rule.Type == devices.WildcardDevice {
// We can safely skip over wildcard entries because there should
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action, the emulator removes those.
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
return nil, "", err
}
}
return p.finalize(), license, nil
}
type program struct {
insts asm.Instructions
defaultAllow bool
blockID int
}
func (p *program) init() {
// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
/*
u32 access_type
u32 major
u32 minor
*/
// R2 <- type (lower 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
asm.And.Imm32(asm.R2, 0xFFFF))
// R3 <- access (upper 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
// RSh: bitwise shift right
asm.RSh.Imm32(asm.R3, 16))
// R4 <- major (u32 major at R1[4])
p.insts = append(p.insts,
asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
// R5 <- minor (u32 minor at R1[8])
p.insts = append(p.insts,
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}
// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
// to the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
func (p *program) appendRule(rule *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
var bpfType int32
switch rule.Type {
case devices.CharDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case devices.BlockDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
default:
// We do not permit 'a', nor any other types we don't know about.
return fmt.Errorf("invalid type %q", string(rule.Type))
}
if rule.Major > math.MaxUint32 {
return fmt.Errorf("invalid major %d", rule.Major)
}
if rule.Minor > math.MaxUint32 {
return fmt.Errorf("invalid minor %d", rule.Major)
}
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
bpfAccess := int32(0)
for _, r := range rule.Permissions {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
case 'w':
bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
case 'm':
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
default:
return fmt.Errorf("unknown device access %v", r)
}
}
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
var (
blockSym = "block-" + strconv.Itoa(p.blockID)
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
asm.Mov.Reg32(asm.R1, asm.R3),
asm.And.Imm32(asm.R1, bpfAccess),
asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
)
}
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
)
}
p.insts = append(p.insts, acceptBlock(rule.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
p.blockID++
return nil
}
func (p *program) finalize() asm.Instructions {
var v int32
if p.defaultAllow {
v = 1
}
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- v
asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
asm.Return(),
)
p.blockID = -1
return p.insts
}
func acceptBlock(accept bool) asm.Instructions {
var v int32
if accept {
v = 1
}
return []asm.Instruction{
// R0 <- v
asm.Mov.Imm32(asm.R0, v),
asm.Return(),
}
}

View File

@@ -0,0 +1,253 @@
package ebpf
import (
"errors"
"fmt"
"os"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func nilCloser() error {
return nil
}
func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
QueryType uint32
AttachFlags uint32
ProgIds uint64 // __aligned_u64
ProgCnt uint32
}
// Currently you can only have 64 eBPF programs attached to a cgroup.
size := 64
retries := 0
for retries < 10 {
progIds := make([]uint32, size)
query := bpfAttrQuery{
TargetFd: uint32(dirFd),
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
ProgCnt: uint32(len(progIds)),
}
// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
}
// Convert the ids to program handles.
progIds = progIds[:size]
programs := make([]*ebpf.Program, 0, len(progIds))
for _, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
if err != nil {
// We skip over programs that give us -EACCES or -EPERM. This
// is necessary because there may be BPF programs that have
// been attached (such as with --systemd-cgroup) which have an
// LSM label that blocks us from interacting with the program.
//
// Because additional BPF_CGROUP_DEVICE programs only can add
// restrictions, there's no real issue with just ignoring these
// programs (and stops runc from breaking on distributions with
// very strict SELinux policies).
if errors.Is(err, os.ErrPermission) {
logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
continue
}
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs = append(programs, program)
}
runtime.KeepAlive(progIds)
return programs, nil
}
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
var (
haveBpfProgReplaceBool bool
haveBpfProgReplaceOnce sync.Once
)
// Loosely based on the BPF_F_REPLACE support check in
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
return
}
defer prog.Close()
devnull, err := os.Open("/dev/null")
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
return
}
defer devnull.Close()
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags.
Target: int(devnull.Fd()),
// Attempt to "replace" bad fds with this program.
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
})
if errors.Is(err, unix.EINVAL) {
// not supported
return
}
// attach_flags test succeeded.
if !errors.Is(err, unix.EBADF) {
logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}
haveBpfProgReplaceBool = true
})
return haveBpfProgReplaceBool
}
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
// Get the list of existing programs.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return nilCloser, err
}
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
// If there is only one old program, we can just replace it directly.
var (
replaceProg *ebpf.Program
attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
)
if useReplaceProg {
replaceProg = oldProgs[0]
attachFlags |= unix.BPF_F_REPLACE
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Replace: replaceProg,
Attach: ebpf.AttachCGroupDevice,
Flags: attachFlags,
})
if err != nil {
return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
// TODO: Should we attach the old filters back in this case? Otherwise
// we fail-open on a security feature, which is a bit scary.
return nil
}
if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
}
}
return closer, nil
}