Revert "Update runc to 1.0.0"

This commit is contained in:
Odin Ugedal
2021-07-05 14:03:04 +02:00
committed by GitHub
parent 5e3bed6399
commit 61d88af9e4
146 changed files with 1196 additions and 2702 deletions

View File

@@ -1,3 +0,0 @@
module github.com/bits-and-blooms/bitset
go 1.14

View File

@@ -18,23 +18,6 @@ reason about the proposed changes.
## Running the tests
Many of the tests require privileges to set resource limits and load eBPF code.
The easiest way to obtain these is to run the tests with `sudo`.
To test the current package with your local kernel you can simply run:
```
go test -exec sudo ./...
```
To test the current package with a different kernel version you can use the [run-tests.sh](run-tests.sh) script.
It requires [virtme](https://github.com/amluto/virtme) and qemu to be installed.
Examples:
```bash
# Run all tests on a 5.4 kernel
./run-tests.sh 5.4
# Run a subset of tests:
./run-tests.sh 5.4 go test ./link
```
The easiest way to obtain these is to run the tests with `sudo`:
sudo go test ./...

View File

@@ -1,7 +1,7 @@
# The development version of clang is distributed as the 'clang' binary,
# while stable/released versions have a version number attached.
# Pin the default clang to a stable version.
CLANG ?= clang-12
CLANG ?= clang-11
CFLAGS := -target bpf -O2 -g -Wall -Werror $(CFLAGS)
# Obtain an absolute path to the directory of the Makefile.
@@ -17,7 +17,7 @@ VERSION := $(shell cat ${REPODIR}/testdata/docker/VERSION)
TARGETS := \
testdata/loader-clang-7 \
testdata/loader-clang-9 \
testdata/loader-$(CLANG) \
testdata/loader-clang-11 \
testdata/invalid_map \
testdata/raw_tracepoint \
testdata/invalid_map_static \
@@ -33,7 +33,6 @@ TARGETS := \
docker-all:
docker run --rm --user "${UIDGID}" \
-v "${REPODIR}":/ebpf -w /ebpf --env MAKEFLAGS \
--env CFLAGS="-fdebug-prefix-map=/ebpf=." \
"${IMAGE}:${VERSION}" \
make all
@@ -48,8 +47,6 @@ clean:
-$(RM) internal/btf/testdata/*.elf
all: $(addsuffix -el.elf,$(TARGETS)) $(addsuffix -eb.elf,$(TARGETS))
ln -srf testdata/loader-$(CLANG)-el.elf testdata/loader-el.elf
ln -srf testdata/loader-$(CLANG)-eb.elf testdata/loader-eb.elf
testdata/loader-%-el.elf: testdata/loader.c
$* $(CFLAGS) -mlittle-endian -c $< -o $@

View File

@@ -57,7 +57,7 @@ func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, err
return 0, fmt.Errorf("can't unmarshal registers: %s", err)
}
if !bi.OpCode.IsDWordLoad() {
if !bi.OpCode.isDWordLoad() {
return InstructionSize, nil
}
@@ -80,7 +80,7 @@ func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error)
return 0, errors.New("invalid opcode")
}
isDWordLoad := ins.OpCode.IsDWordLoad()
isDWordLoad := ins.OpCode.isDWordLoad()
cons := int32(ins.Constant)
if isDWordLoad {
@@ -123,7 +123,7 @@ func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error)
//
// Returns an error if the instruction doesn't load a map.
func (ins *Instruction) RewriteMapPtr(fd int) error {
if !ins.OpCode.IsDWordLoad() {
if !ins.OpCode.isDWordLoad() {
return fmt.Errorf("%s is not a 64 bit load", ins.OpCode)
}
@@ -138,19 +138,15 @@ func (ins *Instruction) RewriteMapPtr(fd int) error {
return nil
}
// MapPtr returns the map fd for this instruction.
//
// The result is undefined if the instruction is not a load from a map,
// see IsLoadFromMap.
func (ins *Instruction) MapPtr() int {
return int(int32(uint64(ins.Constant) & math.MaxUint32))
func (ins *Instruction) mapPtr() uint32 {
return uint32(uint64(ins.Constant) & math.MaxUint32)
}
// RewriteMapOffset changes the offset of a direct load from a map.
//
// Returns an error if the instruction is not a direct load.
func (ins *Instruction) RewriteMapOffset(offset uint32) error {
if !ins.OpCode.IsDWordLoad() {
if !ins.OpCode.isDWordLoad() {
return fmt.Errorf("%s is not a 64 bit load", ins.OpCode)
}
@@ -167,10 +163,10 @@ func (ins *Instruction) mapOffset() uint32 {
return uint32(uint64(ins.Constant) >> 32)
}
// IsLoadFromMap returns true if the instruction loads from a map.
// isLoadFromMap returns true if the instruction loads from a map.
//
// This covers both loading the map pointer and direct map value loads.
func (ins *Instruction) IsLoadFromMap() bool {
func (ins *Instruction) isLoadFromMap() bool {
return ins.OpCode == LoadImmOp(DWord) && (ins.Src == PseudoMapFD || ins.Src == PseudoMapValue)
}
@@ -181,12 +177,6 @@ func (ins *Instruction) IsFunctionCall() bool {
return ins.OpCode.JumpOp() == Call && ins.Src == PseudoCall
}
// IsConstantLoad returns true if the instruction loads a constant of the
// given size.
func (ins *Instruction) IsConstantLoad(size Size) bool {
return ins.OpCode == LoadImmOp(size) && ins.Src == R0 && ins.Offset == 0
}
// Format implements fmt.Formatter.
func (ins Instruction) Format(f fmt.State, c rune) {
if c != 'v' {
@@ -207,8 +197,8 @@ func (ins Instruction) Format(f fmt.State, c rune) {
return
}
if ins.IsLoadFromMap() {
fd := ins.MapPtr()
if ins.isLoadFromMap() {
fd := int32(ins.mapPtr())
switch ins.Src {
case PseudoMapFD:
fmt.Fprintf(f, "LoadMapPtr dst: %s fd: %d", ins.Dst, fd)
@@ -413,7 +403,7 @@ func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error {
func (insns Instructions) Tag(bo binary.ByteOrder) (string, error) {
h := sha1.New()
for i, ins := range insns {
if ins.IsLoadFromMap() {
if ins.isLoadFromMap() {
ins.Constant = 0
}
_, err := ins.Marshal(h, bo)

View File

@@ -111,7 +111,7 @@ func LoadMapPtr(dst Register, fd int) Instruction {
OpCode: LoadImmOp(DWord),
Dst: dst,
Src: PseudoMapFD,
Constant: int64(uint32(fd)),
Constant: int64(fd),
}
}

View File

@@ -69,13 +69,13 @@ const InvalidOpCode OpCode = 0xff
// rawInstructions returns the number of BPF instructions required
// to encode this opcode.
func (op OpCode) rawInstructions() int {
if op.IsDWordLoad() {
if op.isDWordLoad() {
return 2
}
return 1
}
func (op OpCode) IsDWordLoad() bool {
func (op OpCode) isDWordLoad() bool {
return op == LoadImmOp(DWord)
}

View File

@@ -3,7 +3,6 @@ package ebpf
import (
"errors"
"fmt"
"io"
"math"
"reflect"
"strings"
@@ -90,8 +89,8 @@ func (cs *CollectionSpec) RewriteMaps(maps map[string]*Map) error {
//
// The constant must be defined like so in the C program:
//
// volatile const type foobar;
// volatile const type foobar = default;
// static volatile const type foobar;
// static volatile const type foobar = default;
//
// Replacement values must be of the same length as the C sizeof(type).
// If necessary, they are marshalled according to the same rules as
@@ -270,21 +269,11 @@ func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Co
}, nil
}
type handleCache struct {
btfHandles map[*btf.Spec]*btf.Handle
btfSpecs map[io.ReaderAt]*btf.Spec
}
type btfHandleCache map[*btf.Spec]*btf.Handle
func newHandleCache() *handleCache {
return &handleCache{
btfHandles: make(map[*btf.Spec]*btf.Handle),
btfSpecs: make(map[io.ReaderAt]*btf.Spec),
}
}
func (hc handleCache) btfHandle(spec *btf.Spec) (*btf.Handle, error) {
if hc.btfHandles[spec] != nil {
return hc.btfHandles[spec], nil
func (btfs btfHandleCache) load(spec *btf.Spec) (*btf.Handle, error) {
if btfs[spec] != nil {
return btfs[spec], nil
}
handle, err := btf.NewHandle(spec)
@@ -292,30 +281,14 @@ func (hc handleCache) btfHandle(spec *btf.Spec) (*btf.Handle, error) {
return nil, err
}
hc.btfHandles[spec] = handle
btfs[spec] = handle
return handle, nil
}
func (hc handleCache) btfSpec(rd io.ReaderAt) (*btf.Spec, error) {
if hc.btfSpecs[rd] != nil {
return hc.btfSpecs[rd], nil
}
spec, err := btf.LoadSpecFromReader(rd)
if err != nil {
return nil, err
}
hc.btfSpecs[rd] = spec
return spec, nil
}
func (hc handleCache) close() {
for _, handle := range hc.btfHandles {
func (btfs btfHandleCache) close() {
for _, handle := range btfs {
handle.Close()
}
hc.btfHandles = nil
hc.btfSpecs = nil
}
func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
@@ -327,12 +300,12 @@ func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
var (
maps = make(map[string]*Map)
progs = make(map[string]*Program)
handles = newHandleCache()
btfs = make(btfHandleCache)
skipMapsAndProgs = false
)
cleanup = func() {
handles.close()
btfs.close()
if skipMapsAndProgs {
return
@@ -362,7 +335,7 @@ func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
return nil, fmt.Errorf("missing map %s", mapName)
}
m, err := newMapWithOptions(mapSpec, opts.Maps, handles)
m, err := newMapWithOptions(mapSpec, opts.Maps, btfs)
if err != nil {
return nil, fmt.Errorf("map %s: %w", mapName, err)
}
@@ -387,7 +360,7 @@ func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
for i := range progSpec.Instructions {
ins := &progSpec.Instructions[i]
if !ins.IsLoadFromMap() || ins.Reference == "" {
if ins.OpCode != asm.LoadImmOp(asm.DWord) || ins.Reference == "" {
continue
}
@@ -399,7 +372,7 @@ func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
m, err := loadMap(ins.Reference)
if err != nil {
return nil, fmt.Errorf("program %s: %w", progName, err)
return nil, fmt.Errorf("program %s: %s", progName, err)
}
fd := m.FD()
@@ -411,7 +384,7 @@ func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
}
}
prog, err := newProgramWithOptions(progSpec, opts.Programs, handles)
prog, err := newProgramWithOptions(progSpec, opts.Programs, btfs)
if err != nil {
return nil, fmt.Errorf("program %s: %w", progName, err)
}
@@ -561,7 +534,7 @@ func assignValues(to interface{}, valueOf func(reflect.Type, string) (reflect.Va
}
if err != nil {
return fmt.Errorf("field %s: %w", field.Name, err)
return fmt.Errorf("field %s: %s", field.Name, err)
}
}

View File

@@ -96,7 +96,7 @@ func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) {
}
btfSpec, err := btf.LoadSpecFromReader(rd)
if err != nil && !errors.Is(err, btf.ErrNotFound) {
if err != nil {
return nil, fmt.Errorf("load BTF: %w", err)
}
@@ -159,7 +159,7 @@ func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) {
}
if target.Flags&elf.SHF_STRINGS > 0 {
return nil, fmt.Errorf("section %q: string is not stack allocated: %w", section.Name, ErrNotSupported)
return nil, fmt.Errorf("section %q: string %q is not stack allocated: %w", section.Name, rel.Name, ErrNotSupported)
}
target.references++
@@ -374,25 +374,17 @@ func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) err
}
case dataSection:
var offset uint32
switch typ {
case elf.STT_SECTION:
if bind != elf.STB_LOCAL {
return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind)
}
// This is really a reference to a static symbol, which clang doesn't
// emit a symbol table entry for. Instead it encodes the offset in
// the instruction itself.
offset = uint32(uint64(ins.Constant))
case elf.STT_OBJECT:
if bind != elf.STB_GLOBAL {
return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind)
}
offset = uint32(rel.Value)
default:
return fmt.Errorf("incorrect relocation type %v for direct map load", typ)
}
@@ -402,8 +394,10 @@ func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) err
// it's not clear how to encode that into Instruction.
name = target.Name
// The kernel expects the offset in the second basic BPF instruction.
ins.Constant = int64(uint64(offset) << 32)
// For some reason, clang encodes the offset of the symbol its
// section in the first basic BPF instruction, while the kernel
// expects it in the second one.
ins.Constant <<= 32
ins.Src = asm.PseudoMapValue
// Mark the instruction as needing an update when creating the
@@ -497,38 +491,33 @@ func (ec *elfCode) loadMaps(maps map[string]*MapSpec) error {
return fmt.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset)
}
mapName := mapSym.Name
if maps[mapName] != nil {
if maps[mapSym.Name] != nil {
return fmt.Errorf("section %v: map %v already exists", sec.Name, mapSym)
}
lr := io.LimitReader(r, int64(size))
spec := MapSpec{
Name: SanitizeName(mapName, -1),
Name: SanitizeName(mapSym.Name, -1),
}
switch {
case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil:
return fmt.Errorf("map %s: missing type", mapName)
return fmt.Errorf("map %v: missing type", mapSym)
case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil:
return fmt.Errorf("map %s: missing key size", mapName)
return fmt.Errorf("map %v: missing key size", mapSym)
case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil:
return fmt.Errorf("map %s: missing value size", mapName)
return fmt.Errorf("map %v: missing value size", mapSym)
case binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil:
return fmt.Errorf("map %s: missing max entries", mapName)
return fmt.Errorf("map %v: missing max entries", mapSym)
case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil:
return fmt.Errorf("map %s: missing flags", mapName)
return fmt.Errorf("map %v: missing flags", mapSym)
}
if _, err := io.Copy(internal.DiscardZeroes{}, lr); err != nil {
return fmt.Errorf("map %s: unknown and non-zero fields in definition", mapName)
return fmt.Errorf("map %v: unknown and non-zero fields in definition", mapSym)
}
if err := spec.clampPerfEventArraySize(); err != nil {
return fmt.Errorf("map %s: %w", mapName, err)
}
maps[mapName] = &spec
maps[mapSym.Name] = &spec
}
}
@@ -576,10 +565,6 @@ func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec) error {
return fmt.Errorf("map %v: %w", name, err)
}
if err := mapSpec.clampPerfEventArraySize(); err != nil {
return fmt.Errorf("map %v: %w", name, err)
}
maps[name] = mapSpec
}
}
@@ -862,8 +847,6 @@ func getProgType(sectionName string) (ProgramType, AttachType, uint32, string) {
"uretprobe/": {Kprobe, AttachNone, 0},
"tracepoint/": {TracePoint, AttachNone, 0},
"raw_tracepoint/": {RawTracepoint, AttachNone, 0},
"raw_tp/": {RawTracepoint, AttachNone, 0},
"tp_btf/": {Tracing, AttachTraceRawTp, 0},
"xdp": {XDP, AttachNone, 0},
"perf_event": {PerfEvent, AttachNone, 0},
"lwt_in": {LWTIn, AttachNone, 0},

View File

@@ -35,7 +35,7 @@ type Spec struct {
namedTypes map[string][]namedType
funcInfos map[string]extInfo
lineInfos map[string]extInfo
coreRelos map[string]coreRelos
coreRelos map[string]bpfCoreRelos
byteOrder binary.ByteOrder
}
@@ -53,7 +53,7 @@ type btfHeader struct {
// LoadSpecFromReader reads BTF sections from an ELF.
//
// Returns ErrNotFound if the reader contains no BTF.
// Returns a nil Spec and no error if no BTF was present.
func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
file, err := internal.NewSafeELFFile(rd)
if err != nil {
@@ -67,7 +67,7 @@ func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
}
if btfSection == nil {
return nil, fmt.Errorf("btf: %w", ErrNotFound)
return nil, nil
}
symbols, err := file.Symbols()
@@ -438,13 +438,13 @@ func (s *Spec) Program(name string, length uint64) (*Program, error) {
funcInfos, funcOK := s.funcInfos[name]
lineInfos, lineOK := s.lineInfos[name]
relos, coreOK := s.coreRelos[name]
coreRelos, coreOK := s.coreRelos[name]
if !funcOK && !lineOK && !coreOK {
return nil, fmt.Errorf("no extended BTF info for section %s", name)
}
return &Program{s, length, funcInfos, lineInfos, relos}, nil
return &Program{s, length, funcInfos, lineInfos, coreRelos}, nil
}
// Datasec returns the BTF required to create maps which represent data sections.
@@ -491,8 +491,7 @@ func (s *Spec) FindType(name string, typ Type) error {
return fmt.Errorf("type %s: %w", name, ErrNotFound)
}
cpy, _ := copyType(candidate, nil)
value := reflect.Indirect(reflect.ValueOf(cpy))
value := reflect.Indirect(reflect.ValueOf(copyType(candidate)))
reflect.Indirect(reflect.ValueOf(typ)).Set(value)
return nil
}
@@ -607,7 +606,7 @@ type Program struct {
spec *Spec
length uint64
funcInfos, lineInfos extInfo
coreRelos coreRelos
coreRelos bpfCoreRelos
}
// ProgramSpec returns the Spec needed for loading function and line infos into the kernel.
@@ -666,23 +665,16 @@ func ProgramLineInfos(s *Program) (recordSize uint32, bytes []byte, err error) {
return s.lineInfos.recordSize, bytes, nil
}
// ProgramFixups returns the changes required to adjust the program to the target.
// ProgramRelocations returns the CO-RE relocations required to adjust the
// program to the target.
//
// This is a free function instead of a method to hide it from users
// of package ebpf.
func ProgramFixups(s *Program, target *Spec) (COREFixups, error) {
func ProgramRelocations(s *Program, target *Spec) (map[uint64]Relocation, error) {
if len(s.coreRelos) == 0 {
return nil, nil
}
if target == nil {
var err error
target, err = LoadKernelSpec()
if err != nil {
return nil, err
}
}
return coreRelocate(s.spec, target, s.coreRelos)
}

View File

@@ -3,160 +3,43 @@ package btf
import (
"errors"
"fmt"
"math"
"reflect"
"sort"
"strconv"
"strings"
"github.com/cilium/ebpf/asm"
)
// Code in this file is derived from libbpf, which is available under a BSD
// 2-Clause license.
// COREFixup is the result of computing a CO-RE relocation for a target.
type COREFixup struct {
Kind COREKind
Local uint32
Target uint32
Poison bool
// Relocation describes a CO-RE relocation.
type Relocation struct {
Current uint32
New uint32
}
func (f COREFixup) equal(other COREFixup) bool {
return f.Local == other.Local && f.Target == other.Target
func (r Relocation) equal(other Relocation) bool {
return r.Current == other.Current && r.New == other.New
}
func (f COREFixup) String() string {
if f.Poison {
return fmt.Sprintf("%s=poison", f.Kind)
}
return fmt.Sprintf("%s=%d->%d", f.Kind, f.Local, f.Target)
}
func (f COREFixup) apply(ins *asm.Instruction) error {
if f.Poison {
return errors.New("can't poison individual instruction")
}
switch class := ins.OpCode.Class(); class {
case asm.LdXClass, asm.StClass, asm.StXClass:
if want := int16(f.Local); want != ins.Offset {
return fmt.Errorf("invalid offset %d, expected %d", ins.Offset, want)
}
if f.Target > math.MaxInt16 {
return fmt.Errorf("offset %d exceeds MaxInt16", f.Target)
}
ins.Offset = int16(f.Target)
case asm.LdClass:
if !ins.IsConstantLoad(asm.DWord) {
return fmt.Errorf("not a dword-sized immediate load")
}
if want := int64(f.Local); want != ins.Constant {
return fmt.Errorf("invalid immediate %d, expected %d", ins.Constant, want)
}
ins.Constant = int64(f.Target)
case asm.ALUClass:
if ins.OpCode.ALUOp() == asm.Swap {
return fmt.Errorf("relocation against swap")
}
fallthrough
case asm.ALU64Class:
if src := ins.OpCode.Source(); src != asm.ImmSource {
return fmt.Errorf("invalid source %s", src)
}
if want := int64(f.Local); want != ins.Constant {
return fmt.Errorf("invalid immediate %d, expected %d", ins.Constant, want)
}
if f.Target > math.MaxInt32 {
return fmt.Errorf("immediate %d exceeds MaxInt32", f.Target)
}
ins.Constant = int64(f.Target)
default:
return fmt.Errorf("invalid class %s", class)
}
return nil
}
func (f COREFixup) isNonExistant() bool {
return f.Kind.checksForExistence() && f.Target == 0
}
type COREFixups map[uint64]COREFixup
// Apply a set of CO-RE relocations to a BPF program.
func (fs COREFixups) Apply(insns asm.Instructions) (asm.Instructions, error) {
if len(fs) == 0 {
cpy := make(asm.Instructions, len(insns))
copy(cpy, insns)
return insns, nil
}
cpy := make(asm.Instructions, 0, len(insns))
iter := insns.Iterate()
for iter.Next() {
fixup, ok := fs[iter.Offset.Bytes()]
if !ok {
cpy = append(cpy, *iter.Ins)
continue
}
ins := *iter.Ins
if fixup.Poison {
const badRelo = asm.BuiltinFunc(0xbad2310)
cpy = append(cpy, badRelo.Call())
if ins.OpCode.IsDWordLoad() {
// 64 bit constant loads occupy two raw bpf instructions, so
// we need to add another instruction as padding.
cpy = append(cpy, badRelo.Call())
}
continue
}
if err := fixup.apply(&ins); err != nil {
return nil, fmt.Errorf("instruction %d, offset %d: %s: %w", iter.Index, iter.Offset.Bytes(), fixup.Kind, err)
}
cpy = append(cpy, ins)
}
return cpy, nil
}
// COREKind is the type of CO-RE relocation
type COREKind uint32
// coreReloKind is the type of CO-RE relocation
type coreReloKind uint32
const (
reloFieldByteOffset COREKind = iota /* field byte offset */
reloFieldByteSize /* field size in bytes */
reloFieldExists /* field existence in target kernel */
reloFieldSigned /* field signedness (0 - unsigned, 1 - signed) */
reloFieldLShiftU64 /* bitfield-specific left bitshift */
reloFieldRShiftU64 /* bitfield-specific right bitshift */
reloTypeIDLocal /* type ID in local BPF object */
reloTypeIDTarget /* type ID in target kernel */
reloTypeExists /* type existence in target kernel */
reloTypeSize /* type size in bytes */
reloEnumvalExists /* enum value existence in target kernel */
reloEnumvalValue /* enum value integer value */
reloFieldByteOffset coreReloKind = iota /* field byte offset */
reloFieldByteSize /* field size in bytes */
reloFieldExists /* field existence in target kernel */
reloFieldSigned /* field signedness (0 - unsigned, 1 - signed) */
reloFieldLShiftU64 /* bitfield-specific left bitshift */
reloFieldRShiftU64 /* bitfield-specific right bitshift */
reloTypeIDLocal /* type ID in local BPF object */
reloTypeIDTarget /* type ID in target kernel */
reloTypeExists /* type existence in target kernel */
reloTypeSize /* type size in bytes */
reloEnumvalExists /* enum value existence in target kernel */
reloEnumvalValue /* enum value integer value */
)
func (k COREKind) String() string {
func (k coreReloKind) String() string {
switch k {
case reloFieldByteOffset:
return "byte_off"
@@ -187,249 +70,103 @@ func (k COREKind) String() string {
}
}
func (k COREKind) checksForExistence() bool {
return k == reloEnumvalExists || k == reloTypeExists || k == reloFieldExists
}
func coreRelocate(local, target *Spec, coreRelos bpfCoreRelos) (map[uint64]Relocation, error) {
if target == nil {
var err error
target, err = loadKernelSpec()
if err != nil {
return nil, err
}
}
func coreRelocate(local, target *Spec, relos coreRelos) (COREFixups, error) {
if local.byteOrder != target.byteOrder {
return nil, fmt.Errorf("can't relocate %s against %s", local.byteOrder, target.byteOrder)
}
var ids []TypeID
relosByID := make(map[TypeID]coreRelos)
result := make(COREFixups, len(relos))
for _, relo := range relos {
if relo.kind == reloTypeIDLocal {
// Filtering out reloTypeIDLocal here makes our lives a lot easier
// down the line, since it doesn't have a target at all.
if len(relo.accessor) > 1 || relo.accessor[0] != 0 {
return nil, fmt.Errorf("%s: unexpected accessor %v", relo.kind, relo.accessor)
}
result[uint64(relo.insnOff)] = COREFixup{
relo.kind,
uint32(relo.typeID),
uint32(relo.typeID),
false,
}
continue
}
relos, ok := relosByID[relo.typeID]
if !ok {
ids = append(ids, relo.typeID)
}
relosByID[relo.typeID] = append(relos, relo)
}
// Ensure we work on relocations in a deterministic order.
sort.Slice(ids, func(i, j int) bool {
return ids[i] < ids[j]
})
for _, id := range ids {
if int(id) >= len(local.types) {
return nil, fmt.Errorf("invalid type id %d", id)
}
localType := local.types[id]
named, ok := localType.(namedType)
if !ok || named.name() == "" {
return nil, fmt.Errorf("relocate unnamed or anonymous type %s: %w", localType, ErrNotSupported)
}
relos := relosByID[id]
targets := target.namedTypes[named.essentialName()]
fixups, err := coreCalculateFixups(localType, targets, relos)
if err != nil {
return nil, fmt.Errorf("relocate %s: %w", localType, err)
}
for i, relo := range relos {
result[uint64(relo.insnOff)] = fixups[i]
}
}
return result, nil
}
var errAmbiguousRelocation = errors.New("ambiguous relocation")
var errImpossibleRelocation = errors.New("impossible relocation")
// coreCalculateFixups calculates the fixups for the given relocations using
// the "best" target.
//
// The best target is determined by scoring: the less poisoning we have to do
// the better the target is.
func coreCalculateFixups(local Type, targets []namedType, relos coreRelos) ([]COREFixup, error) {
localID := local.ID()
local, err := copyType(local, skipQualifierAndTypedef)
if err != nil {
return nil, err
}
bestScore := len(relos)
var bestFixups []COREFixup
for i := range targets {
targetID := targets[i].ID()
target, err := copyType(targets[i], skipQualifierAndTypedef)
relocations := make(map[uint64]Relocation, len(coreRelos))
for _, relo := range coreRelos {
accessorStr, err := local.strings.Lookup(relo.AccessStrOff)
if err != nil {
return nil, err
}
score := 0 // lower is better
fixups := make([]COREFixup, 0, len(relos))
for _, relo := range relos {
fixup, err := coreCalculateFixup(local, localID, target, targetID, relo)
if err != nil {
return nil, fmt.Errorf("target %s: %w", target, err)
}
if fixup.Poison || fixup.isNonExistant() {
score++
}
fixups = append(fixups, fixup)
accessor, err := parseCoreAccessor(accessorStr)
if err != nil {
return nil, fmt.Errorf("accessor %q: %s", accessorStr, err)
}
if score > bestScore {
// We have a better target already, ignore this one.
if int(relo.TypeID) >= len(local.types) {
return nil, fmt.Errorf("invalid type id %d", relo.TypeID)
}
typ := local.types[relo.TypeID]
if relo.ReloKind == reloTypeIDLocal {
relocations[uint64(relo.InsnOff)] = Relocation{
uint32(typ.ID()),
uint32(typ.ID()),
}
continue
}
if score < bestScore {
// This is the best target yet, use it.
bestScore = score
bestFixups = fixups
continue
named, ok := typ.(namedType)
if !ok || named.name() == "" {
return nil, fmt.Errorf("relocate anonymous type %s: %w", typ.String(), ErrNotSupported)
}
// Some other target has the same score as the current one. Make sure
// the fixups agree with each other.
for i, fixup := range bestFixups {
if !fixup.equal(fixups[i]) {
return nil, fmt.Errorf("%s: multiple types match: %w", fixup.Kind, errAmbiguousRelocation)
}
name := essentialName(named.name())
res, err := coreCalculateRelocation(typ, target.namedTypes[name], relo.ReloKind, accessor)
if err != nil {
return nil, fmt.Errorf("relocate %s: %w", name, err)
}
relocations[uint64(relo.InsnOff)] = res
}
if bestFixups == nil {
// Nothing at all matched, probably because there are no suitable
// targets at all. Poison everything!
bestFixups = make([]COREFixup, len(relos))
for i, relo := range relos {
bestFixups[i] = COREFixup{Kind: relo.kind, Poison: true}
}
}
return bestFixups, nil
return relocations, nil
}
// coreCalculateFixup calculates the fixup for a single local type, target type
// and relocation.
func coreCalculateFixup(local Type, localID TypeID, target Type, targetID TypeID, relo coreRelo) (COREFixup, error) {
fixup := func(local, target uint32) (COREFixup, error) {
return COREFixup{relo.kind, local, target, false}, nil
}
poison := func() (COREFixup, error) {
if relo.kind.checksForExistence() {
return fixup(1, 0)
}
return COREFixup{relo.kind, 0, 0, true}, nil
}
zero := COREFixup{}
switch relo.kind {
case reloTypeIDTarget, reloTypeSize, reloTypeExists:
if len(relo.accessor) > 1 || relo.accessor[0] != 0 {
return zero, fmt.Errorf("%s: unexpected accessor %v", relo.kind, relo.accessor)
}
err := coreAreTypesCompatible(local, target)
if errors.Is(err, errImpossibleRelocation) {
return poison()
}
if err != nil {
return zero, fmt.Errorf("relocation %s: %w", relo.kind, err)
}
switch relo.kind {
case reloTypeExists:
return fixup(1, 1)
var errAmbiguousRelocation = errors.New("ambiguous relocation")
func coreCalculateRelocation(local Type, targets []namedType, kind coreReloKind, localAccessor coreAccessor) (Relocation, error) {
var relos []Relocation
var matches []Type
for _, target := range targets {
switch kind {
case reloTypeIDTarget:
return fixup(uint32(localID), uint32(targetID))
case reloTypeSize:
localSize, err := Sizeof(local)
if err != nil {
return zero, err
if localAccessor[0] != 0 {
return Relocation{}, fmt.Errorf("%s: unexpected non-zero accessor", kind)
}
targetSize, err := Sizeof(target)
if err != nil {
return zero, err
if compat, err := coreAreTypesCompatible(local, target); err != nil {
return Relocation{}, fmt.Errorf("%s: %s", kind, err)
} else if !compat {
continue
}
return fixup(uint32(localSize), uint32(targetSize))
relos = append(relos, Relocation{uint32(target.ID()), uint32(target.ID())})
default:
return Relocation{}, fmt.Errorf("relocation %s: %w", kind, ErrNotSupported)
}
matches = append(matches, target)
}
case reloEnumvalValue, reloEnumvalExists:
localValue, targetValue, err := coreFindEnumValue(local, relo.accessor, target)
if errors.Is(err, errImpossibleRelocation) {
return poison()
}
if err != nil {
return zero, fmt.Errorf("relocation %s: %w", relo.kind, err)
}
if len(relos) == 0 {
// TODO: Add switch for existence checks like reloEnumvalExists here.
switch relo.kind {
case reloEnumvalExists:
return fixup(1, 1)
case reloEnumvalValue:
return fixup(uint32(localValue.Value), uint32(targetValue.Value))
}
case reloFieldByteOffset, reloFieldByteSize, reloFieldExists:
if _, ok := target.(*Fwd); ok {
// We can't relocate fields using a forward declaration, so
// skip it. If a non-forward declaration is present in the BTF
// we'll find it in one of the other iterations.
return poison()
}
localField, targetField, err := coreFindField(local, relo.accessor, target)
if errors.Is(err, errImpossibleRelocation) {
return poison()
}
if err != nil {
return zero, fmt.Errorf("target %s: %w", target, err)
}
switch relo.kind {
case reloFieldExists:
return fixup(1, 1)
case reloFieldByteOffset:
return fixup(localField.offset/8, targetField.offset/8)
case reloFieldByteSize:
localSize, err := Sizeof(localField.Type)
if err != nil {
return zero, err
}
targetSize, err := Sizeof(targetField.Type)
if err != nil {
return zero, err
}
return fixup(uint32(localSize), uint32(targetSize))
// TODO: This might have to be poisoned.
return Relocation{}, fmt.Errorf("no relocation found, tried %v", targets)
}
relo := relos[0]
for _, altRelo := range relos[1:] {
if !altRelo.equal(relo) {
return Relocation{}, fmt.Errorf("multiple types %v match: %w", matches, errAmbiguousRelocation)
}
}
return zero, fmt.Errorf("relocation %s: %w", relo.kind, ErrNotSupported)
return relo, nil
}
/* coreAccessor contains a path through a struct. It contains at least one index.
@@ -482,240 +219,6 @@ func parseCoreAccessor(accessor string) (coreAccessor, error) {
return result, nil
}
func (ca coreAccessor) String() string {
strs := make([]string, 0, len(ca))
for _, i := range ca {
strs = append(strs, strconv.Itoa(i))
}
return strings.Join(strs, ":")
}
func (ca coreAccessor) enumValue(t Type) (*EnumValue, error) {
e, ok := t.(*Enum)
if !ok {
return nil, fmt.Errorf("not an enum: %s", t)
}
if len(ca) > 1 {
return nil, fmt.Errorf("invalid accessor %s for enum", ca)
}
i := ca[0]
if i >= len(e.Values) {
return nil, fmt.Errorf("invalid index %d for %s", i, e)
}
return &e.Values[i], nil
}
type coreField struct {
Type Type
offset uint32
}
func adjustOffset(base uint32, t Type, n int) (uint32, error) {
size, err := Sizeof(t)
if err != nil {
return 0, err
}
return base + (uint32(n) * uint32(size) * 8), nil
}
// coreFindField descends into the local type using the accessor and tries to
// find an equivalent field in target at each step.
//
// Returns the field and the offset of the field from the start of
// target in bits.
func coreFindField(local Type, localAcc coreAccessor, target Type) (_, _ coreField, _ error) {
// The first index is used to offset a pointer of the base type like
// when accessing an array.
localOffset, err := adjustOffset(0, local, localAcc[0])
if err != nil {
return coreField{}, coreField{}, err
}
targetOffset, err := adjustOffset(0, target, localAcc[0])
if err != nil {
return coreField{}, coreField{}, err
}
if err := coreAreMembersCompatible(local, target); err != nil {
return coreField{}, coreField{}, fmt.Errorf("fields: %w", err)
}
var localMaybeFlex, targetMaybeFlex bool
for _, acc := range localAcc[1:] {
switch localType := local.(type) {
case composite:
// For composite types acc is used to find the field in the local type,
// and then we try to find a field in target with the same name.
localMembers := localType.members()
if acc >= len(localMembers) {
return coreField{}, coreField{}, fmt.Errorf("invalid accessor %d for %s", acc, local)
}
localMember := localMembers[acc]
if localMember.Name == "" {
_, ok := localMember.Type.(composite)
if !ok {
return coreField{}, coreField{}, fmt.Errorf("unnamed field with type %s: %s", localMember.Type, ErrNotSupported)
}
// This is an anonymous struct or union, ignore it.
local = localMember.Type
localOffset += localMember.Offset
localMaybeFlex = false
continue
}
targetType, ok := target.(composite)
if !ok {
return coreField{}, coreField{}, fmt.Errorf("target not composite: %w", errImpossibleRelocation)
}
targetMember, last, err := coreFindMember(targetType, localMember.Name)
if err != nil {
return coreField{}, coreField{}, err
}
if targetMember.BitfieldSize > 0 {
return coreField{}, coreField{}, fmt.Errorf("field %q is a bitfield: %w", targetMember.Name, ErrNotSupported)
}
local = localMember.Type
localMaybeFlex = acc == len(localMembers)-1
localOffset += localMember.Offset
target = targetMember.Type
targetMaybeFlex = last
targetOffset += targetMember.Offset
case *Array:
// For arrays, acc is the index in the target.
targetType, ok := target.(*Array)
if !ok {
return coreField{}, coreField{}, fmt.Errorf("target not array: %w", errImpossibleRelocation)
}
if localType.Nelems == 0 && !localMaybeFlex {
return coreField{}, coreField{}, fmt.Errorf("local type has invalid flexible array")
}
if targetType.Nelems == 0 && !targetMaybeFlex {
return coreField{}, coreField{}, fmt.Errorf("target type has invalid flexible array")
}
if localType.Nelems > 0 && acc >= int(localType.Nelems) {
return coreField{}, coreField{}, fmt.Errorf("invalid access of %s at index %d", localType, acc)
}
if targetType.Nelems > 0 && acc >= int(targetType.Nelems) {
return coreField{}, coreField{}, fmt.Errorf("out of bounds access of target: %w", errImpossibleRelocation)
}
local = localType.Type
localMaybeFlex = false
localOffset, err = adjustOffset(localOffset, local, acc)
if err != nil {
return coreField{}, coreField{}, err
}
target = targetType.Type
targetMaybeFlex = false
targetOffset, err = adjustOffset(targetOffset, target, acc)
if err != nil {
return coreField{}, coreField{}, err
}
default:
return coreField{}, coreField{}, fmt.Errorf("relocate field of %T: %w", localType, ErrNotSupported)
}
if err := coreAreMembersCompatible(local, target); err != nil {
return coreField{}, coreField{}, err
}
}
return coreField{local, localOffset}, coreField{target, targetOffset}, nil
}
// coreFindMember finds a member in a composite type while handling anonymous
// structs and unions.
func coreFindMember(typ composite, name Name) (Member, bool, error) {
if name == "" {
return Member{}, false, errors.New("can't search for anonymous member")
}
type offsetTarget struct {
composite
offset uint32
}
targets := []offsetTarget{{typ, 0}}
visited := make(map[composite]bool)
for i := 0; i < len(targets); i++ {
target := targets[i]
// Only visit targets once to prevent infinite recursion.
if visited[target] {
continue
}
if len(visited) >= maxTypeDepth {
// This check is different than libbpf, which restricts the entire
// path to BPF_CORE_SPEC_MAX_LEN items.
return Member{}, false, fmt.Errorf("type is nested too deep")
}
visited[target] = true
members := target.members()
for j, member := range members {
if member.Name == name {
// NB: This is safe because member is a copy.
member.Offset += target.offset
return member, j == len(members)-1, nil
}
// The names don't match, but this member could be an anonymous struct
// or union.
if member.Name != "" {
continue
}
comp, ok := member.Type.(composite)
if !ok {
return Member{}, false, fmt.Errorf("anonymous non-composite type %T not allowed", member.Type)
}
targets = append(targets, offsetTarget{comp, target.offset + member.Offset})
}
}
return Member{}, false, fmt.Errorf("no matching member: %w", errImpossibleRelocation)
}
// coreFindEnumValue follows localAcc to find the equivalent enum value in target.
func coreFindEnumValue(local Type, localAcc coreAccessor, target Type) (localValue, targetValue *EnumValue, _ error) {
localValue, err := localAcc.enumValue(local)
if err != nil {
return nil, nil, err
}
targetEnum, ok := target.(*Enum)
if !ok {
return nil, nil, errImpossibleRelocation
}
localName := localValue.Name.essentialName()
for i, targetValue := range targetEnum.Values {
if targetValue.Name.essentialName() != localName {
continue
}
return localValue, &targetEnum.Values[i], nil
}
return nil, nil, errImpossibleRelocation
}
/* The comment below is from bpf_core_types_are_compat in libbpf.c:
*
* Check local and target types for compatibility. This check is used for
@@ -736,10 +239,8 @@ func coreFindEnumValue(local Type, localAcc coreAccessor, target Type) (localVal
* number of input args and compatible return and argument types.
* These rules are not set in stone and probably will be adjusted as we get
* more experience with using BPF CO-RE relocations.
*
* Returns errImpossibleRelocation if types are not compatible.
*/
func coreAreTypesCompatible(localType Type, targetType Type) error {
func coreAreTypesCompatible(localType Type, targetType Type) (bool, error) {
var (
localTs, targetTs typeDeque
l, t = &localType, &targetType
@@ -748,14 +249,14 @@ func coreAreTypesCompatible(localType Type, targetType Type) error {
for ; l != nil && t != nil; l, t = localTs.shift(), targetTs.shift() {
if depth >= maxTypeDepth {
return errors.New("types are nested too deep")
return false, errors.New("types are nested too deep")
}
localType = *l
targetType = *t
localType = skipQualifierAndTypedef(*l)
targetType = skipQualifierAndTypedef(*t)
if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
return fmt.Errorf("type mismatch: %w", errImpossibleRelocation)
return false, nil
}
switch lv := (localType).(type) {
@@ -765,7 +266,7 @@ func coreAreTypesCompatible(localType Type, targetType Type) error {
case *Int:
tv := targetType.(*Int)
if lv.isBitfield() || tv.isBitfield() {
return fmt.Errorf("bitfield: %w", errImpossibleRelocation)
return false, nil
}
case *Pointer, *Array:
@@ -776,7 +277,7 @@ func coreAreTypesCompatible(localType Type, targetType Type) error {
case *FuncProto:
tv := targetType.(*FuncProto)
if len(lv.Params) != len(tv.Params) {
return fmt.Errorf("function param mismatch: %w", errImpossibleRelocation)
return false, nil
}
depth++
@@ -784,24 +285,22 @@ func coreAreTypesCompatible(localType Type, targetType Type) error {
targetType.walk(&targetTs)
default:
return fmt.Errorf("unsupported type %T", localType)
return false, fmt.Errorf("unsupported type %T", localType)
}
}
if l != nil {
return fmt.Errorf("dangling local type %T", *l)
return false, fmt.Errorf("dangling local type %T", *l)
}
if t != nil {
return fmt.Errorf("dangling target type %T", *t)
return false, fmt.Errorf("dangling target type %T", *t)
}
return nil
return true, nil
}
/* coreAreMembersCompatible checks two types for field-based relocation compatibility.
*
* The comment below is from bpf_core_fields_are_compat in libbpf.c:
/* The comment below is from bpf_core_fields_are_compat in libbpf.c:
*
* Check two types for compatibility for the purpose of field access
* relocation. const/volatile/restrict and typedefs are skipped to ensure we
@@ -815,63 +314,65 @@ func coreAreTypesCompatible(localType Type, targetType Type) error {
* - for INT, size and signedness are ignored;
* - for ARRAY, dimensionality is ignored, element types are checked for
* compatibility recursively;
* [ NB: coreAreMembersCompatible doesn't recurse, this check is done
* by coreFindField. ]
* - everything else shouldn't be ever a target of relocation.
* These rules are not set in stone and probably will be adjusted as we get
* more experience with using BPF CO-RE relocations.
*
* Returns errImpossibleRelocation if the members are not compatible.
*/
func coreAreMembersCompatible(localType Type, targetType Type) error {
doNamesMatch := func(a, b string) error {
func coreAreMembersCompatible(localType Type, targetType Type) (bool, error) {
doNamesMatch := func(a, b string) bool {
if a == "" || b == "" {
// allow anonymous and named type to match
return nil
return true
}
if essentialName(a) == essentialName(b) {
return nil
return essentialName(a) == essentialName(b)
}
for depth := 0; depth <= maxTypeDepth; depth++ {
localType = skipQualifierAndTypedef(localType)
targetType = skipQualifierAndTypedef(targetType)
_, lok := localType.(composite)
_, tok := targetType.(composite)
if lok && tok {
return true, nil
}
return fmt.Errorf("names don't match: %w", errImpossibleRelocation)
}
_, lok := localType.(composite)
_, tok := targetType.(composite)
if lok && tok {
return nil
}
if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
return fmt.Errorf("type mismatch: %w", errImpossibleRelocation)
}
switch lv := localType.(type) {
case *Array, *Pointer:
return nil
case *Enum:
tv := targetType.(*Enum)
return doNamesMatch(lv.name(), tv.name())
case *Fwd:
tv := targetType.(*Fwd)
return doNamesMatch(lv.name(), tv.name())
case *Int:
tv := targetType.(*Int)
if lv.isBitfield() || tv.isBitfield() {
return fmt.Errorf("bitfield: %w", errImpossibleRelocation)
if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
return false, nil
}
return nil
default:
return fmt.Errorf("type %s: %w", localType, ErrNotSupported)
switch lv := localType.(type) {
case *Pointer:
return true, nil
case *Enum:
tv := targetType.(*Enum)
return doNamesMatch(lv.name(), tv.name()), nil
case *Fwd:
tv := targetType.(*Fwd)
return doNamesMatch(lv.name(), tv.name()), nil
case *Int:
tv := targetType.(*Int)
return !lv.isBitfield() && !tv.isBitfield(), nil
case *Array:
tv := targetType.(*Array)
localType = lv.Type
targetType = tv.Type
default:
return false, fmt.Errorf("unsupported type %T", localType)
}
}
return false, errors.New("types are nested too deep")
}
func skipQualifierAndTypedef(typ Type) (Type, error) {
func skipQualifierAndTypedef(typ Type) Type {
result := typ
for depth := 0; depth <= maxTypeDepth; depth++ {
switch v := (result).(type) {
@@ -880,8 +381,8 @@ func skipQualifierAndTypedef(typ Type) (Type, error) {
case *Typedef:
result = v.Type
default:
return result, nil
return result
}
}
return nil, errors.New("exceeded type depth")
return typ
}

View File

@@ -30,7 +30,7 @@ type btfExtCoreHeader struct {
CoreReloLen uint32
}
func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, relos map[string]coreRelos, err error) {
func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, coreRelos map[string]bpfCoreRelos, err error) {
var header btfExtHeader
var coreHeader btfExtCoreHeader
if err := binary.Read(r, bo, &header); err != nil {
@@ -94,13 +94,13 @@ func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (f
return nil, nil, nil, fmt.Errorf("can't seek to CO-RE relocation section: %v", err)
}
relos, err = parseExtInfoRelos(io.LimitReader(r, int64(coreHeader.CoreReloLen)), bo, strings)
coreRelos, err = parseExtInfoRelos(io.LimitReader(r, int64(coreHeader.CoreReloLen)), bo, strings)
if err != nil {
return nil, nil, nil, fmt.Errorf("CO-RE relocation info: %w", err)
}
}
return funcInfo, lineInfo, relos, nil
return funcInfo, lineInfo, coreRelos, nil
}
type btfExtInfoSec struct {
@@ -208,25 +208,18 @@ type bpfCoreRelo struct {
InsnOff uint32
TypeID TypeID
AccessStrOff uint32
Kind COREKind
ReloKind coreReloKind
}
type coreRelo struct {
insnOff uint32
typeID TypeID
accessor coreAccessor
kind COREKind
}
type coreRelos []coreRelo
type bpfCoreRelos []bpfCoreRelo
// append two slices of extInfoRelo to each other. The InsnOff of b are adjusted
// by offset.
func (r coreRelos) append(other coreRelos, offset uint64) coreRelos {
result := make([]coreRelo, 0, len(r)+len(other))
func (r bpfCoreRelos) append(other bpfCoreRelos, offset uint64) bpfCoreRelos {
result := make([]bpfCoreRelo, 0, len(r)+len(other))
result = append(result, r...)
for _, relo := range other {
relo.insnOff += uint32(offset)
relo.InsnOff += uint32(offset)
result = append(result, relo)
}
return result
@@ -234,7 +227,7 @@ func (r coreRelos) append(other coreRelos, offset uint64) coreRelos {
var extInfoReloSize = binary.Size(bpfCoreRelo{})
func parseExtInfoRelos(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]coreRelos, error) {
func parseExtInfoRelos(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]bpfCoreRelos, error) {
var recordSize uint32
if err := binary.Read(r, bo, &recordSize); err != nil {
return nil, fmt.Errorf("read record size: %v", err)
@@ -244,14 +237,14 @@ func parseExtInfoRelos(r io.Reader, bo binary.ByteOrder, strings stringTable) (m
return nil, fmt.Errorf("expected record size %d, got %d", extInfoReloSize, recordSize)
}
result := make(map[string]coreRelos)
result := make(map[string]bpfCoreRelos)
for {
secName, infoHeader, err := parseExtInfoHeader(r, bo, strings)
if errors.Is(err, io.EOF) {
return result, nil
}
var relos coreRelos
var relos []bpfCoreRelo
for i := uint32(0); i < infoHeader.NumInfo; i++ {
var relo bpfCoreRelo
if err := binary.Read(r, bo, &relo); err != nil {
@@ -262,22 +255,7 @@ func parseExtInfoRelos(r io.Reader, bo binary.ByteOrder, strings stringTable) (m
return nil, fmt.Errorf("section %v: offset %v is not aligned with instruction size", secName, relo.InsnOff)
}
accessorStr, err := strings.Lookup(relo.AccessStrOff)
if err != nil {
return nil, err
}
accessor, err := parseCoreAccessor(accessorStr)
if err != nil {
return nil, fmt.Errorf("accessor %q: %s", accessorStr, err)
}
relos = append(relos, coreRelo{
relo.InsnOff,
relo.TypeID,
accessor,
relo.Kind,
})
relos = append(relos, relo)
}
result[secName] = relos

View File

@@ -1,6 +1,7 @@
package btf
import (
"errors"
"fmt"
"math"
"strings"
@@ -36,7 +37,6 @@ type Type interface {
type namedType interface {
Type
name() string
essentialName() string
}
// Name identifies a type.
@@ -48,10 +48,6 @@ func (n Name) name() string {
return string(n)
}
func (n Name) essentialName() string {
return essentialName(string(n))
}
// Void is the unit type of BTF.
type Void struct{}
@@ -178,7 +174,8 @@ func (s *Struct) walk(tdq *typeDeque) {
func (s *Struct) copy() Type {
cpy := *s
cpy.Members = copyMembers(s.Members)
cpy.Members = make([]Member, len(s.Members))
copy(cpy.Members, s.Members)
return &cpy
}
@@ -209,7 +206,8 @@ func (u *Union) walk(tdq *typeDeque) {
func (u *Union) copy() Type {
cpy := *u
cpy.Members = copyMembers(u.Members)
cpy.Members = make([]Member, len(u.Members))
copy(cpy.Members, u.Members)
return &cpy
}
@@ -217,12 +215,6 @@ func (u *Union) members() []Member {
return u.Members
}
func copyMembers(orig []Member) []Member {
cpy := make([]Member, len(orig))
copy(cpy, orig)
return cpy
}
type composite interface {
members() []Member
}
@@ -519,7 +511,7 @@ func Sizeof(typ Type) (int, error) {
switch v := typ.(type) {
case *Array:
if n > 0 && int64(v.Nelems) > math.MaxInt64/n {
return 0, fmt.Errorf("type %s: overflow", typ)
return 0, errors.New("overflow")
}
// Arrays may be of zero length, which allows
@@ -540,30 +532,28 @@ func Sizeof(typ Type) (int, error) {
continue
default:
return 0, fmt.Errorf("unsized type %T", typ)
return 0, fmt.Errorf("unrecognized type %T", typ)
}
if n > 0 && elem > math.MaxInt64/n {
return 0, fmt.Errorf("type %s: overflow", typ)
return 0, errors.New("overflow")
}
size := n * elem
if int64(int(size)) != size {
return 0, fmt.Errorf("type %s: overflow", typ)
return 0, errors.New("overflow")
}
return int(size), nil
}
return 0, fmt.Errorf("type %s: exceeded type depth", typ)
return 0, errors.New("exceeded type depth")
}
// copy a Type recursively.
//
// typ may form a cycle.
//
// Returns any errors from transform verbatim.
func copyType(typ Type, transform func(Type) (Type, error)) (Type, error) {
func copyType(typ Type) Type {
var (
copies = make(map[Type]Type)
work typeDeque
@@ -576,17 +566,7 @@ func copyType(typ Type, transform func(Type) (Type, error)) (Type, error) {
continue
}
var cpy Type
if transform != nil {
tf, err := transform(*t)
if err != nil {
return nil, fmt.Errorf("copy %s: %w", typ, err)
}
cpy = tf.copy()
} else {
cpy = (*t).copy()
}
cpy := (*t).copy()
copies[*t] = cpy
*t = cpy
@@ -594,7 +574,7 @@ func copyType(typ Type, transform func(Type) (Type, error)) (Type, error) {
cpy.walk(&work)
}
return typ, nil
return typ
}
// typeDeque keeps track of pointers to types which still

View File

@@ -50,19 +50,3 @@ func (se *SafeELFFile) Symbols() (syms []elf.Symbol, err error) {
syms, err = se.File.Symbols()
return
}
// DynamicSymbols is the safe version of elf.File.DynamicSymbols.
func (se *SafeELFFile) DynamicSymbols() (syms []elf.Symbol, err error) {
defer func() {
r := recover()
if r == nil {
return
}
syms = nil
err = fmt.Errorf("reading ELF dynamic symbols panicked: %s", r)
}()
syms, err = se.File.DynamicSymbols()
return
}

View File

@@ -9,16 +9,11 @@ import (
// depending on the host's endianness.
var NativeEndian binary.ByteOrder
// Clang is set to either "el" or "eb" depending on the host's endianness.
var ClangEndian string
func init() {
if isBigEndian() {
NativeEndian = binary.BigEndian
ClangEndian = "eb"
} else {
NativeEndian = binary.LittleEndian
ClangEndian = "el"
}
}

View File

@@ -29,10 +29,6 @@ type VerifierError struct {
log string
}
func (le *VerifierError) Unwrap() error {
return le.cause
}
func (le *VerifierError) Error() string {
if le.log == "" {
return le.cause.Error()

View File

@@ -22,6 +22,10 @@ func NewSlicePointer(buf []byte) Pointer {
// NewStringPointer creates a 64-bit pointer from a string.
func NewStringPointer(str string) Pointer {
if str == "" {
return Pointer{}
}
p, err := unix.BytePtrFromString(str)
if err != nil {
return Pointer{}

View File

@@ -42,7 +42,6 @@ const (
PROT_READ = linux.PROT_READ
PROT_WRITE = linux.PROT_WRITE
MAP_SHARED = linux.MAP_SHARED
PERF_ATTR_SIZE_VER1 = linux.PERF_ATTR_SIZE_VER1
PERF_TYPE_SOFTWARE = linux.PERF_TYPE_SOFTWARE
PERF_TYPE_TRACEPOINT = linux.PERF_TYPE_TRACEPOINT
PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT

View File

@@ -43,7 +43,6 @@ const (
PROT_READ = 0x1
PROT_WRITE = 0x2
MAP_SHARED = 0x1
PERF_ATTR_SIZE_VER1 = 0
PERF_TYPE_SOFTWARE = 0x1
PERF_TYPE_TRACEPOINT = 0
PERF_COUNT_SW_BPF_OUTPUT = 0xa

View File

@@ -1,16 +1,12 @@
package link
import (
"bytes"
"crypto/rand"
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
@@ -19,60 +15,13 @@ import (
var (
kprobeEventsPath = filepath.Join(tracefsPath, "kprobe_events")
kprobeRetprobeBit = struct {
once sync.Once
value uint64
err error
}{}
)
type probeType uint8
const (
kprobeType probeType = iota
uprobeType
)
func (pt probeType) String() string {
if pt == kprobeType {
return "kprobe"
}
return "uprobe"
}
func (pt probeType) EventsPath() string {
if pt == kprobeType {
return kprobeEventsPath
}
return uprobeEventsPath
}
func (pt probeType) PerfEventType(ret bool) perfEventType {
if pt == kprobeType {
if ret {
return kretprobeEvent
}
return kprobeEvent
}
if ret {
return uretprobeEvent
}
return uprobeEvent
}
func (pt probeType) RetprobeBit() (uint64, error) {
if pt == kprobeType {
return kretprobeBit()
}
return uretprobeBit()
}
// Kprobe attaches the given eBPF program to a perf event that fires when the
// given kernel symbol starts executing. See /proc/kallsyms for available
// symbols. For example, printk():
//
// Kprobe("printk", prog)
// Kprobe("printk")
//
// The resulting Link must be Closed during program shutdown to avoid leaking
// system resources.
@@ -95,7 +44,7 @@ func Kprobe(symbol string, prog *ebpf.Program) (Link, error) {
// before the given kernel symbol exits, with the function stack left intact.
// See /proc/kallsyms for available symbols. For example, printk():
//
// Kretprobe("printk", prog)
// Kretprobe("printk")
//
// The resulting Link must be Closed during program shutdown to avoid leaking
// system resources.
@@ -131,10 +80,7 @@ func kprobe(symbol string, prog *ebpf.Program, ret bool) (*perfEvent, error) {
}
// Use kprobe PMU if the kernel has it available.
tp, err := pmuKprobe(platformPrefix(symbol), ret)
if errors.Is(err, os.ErrNotExist) {
tp, err = pmuKprobe(symbol, ret)
}
tp, err := pmuKprobe(symbol, ret)
if err == nil {
return tp, nil
}
@@ -143,10 +89,7 @@ func kprobe(symbol string, prog *ebpf.Program, ret bool) (*perfEvent, error) {
}
// Use tracefs if kprobe PMU is missing.
tp, err = tracefsKprobe(platformPrefix(symbol), ret)
if errors.Is(err, os.ErrNotExist) {
tp, err = tracefsKprobe(symbol, ret)
}
tp, err = tracefsKprobe(symbol, ret)
if err != nil {
return nil, fmt.Errorf("creating trace event '%s' in tracefs: %w", symbol, err)
}
@@ -154,70 +97,36 @@ func kprobe(symbol string, prog *ebpf.Program, ret bool) (*perfEvent, error) {
return tp, nil
}
// pmuKprobe opens a perf event based on the kprobe PMU.
// Returns os.ErrNotExist if the given symbol does not exist in the kernel.
// pmuKprobe opens a perf event based on a Performance Monitoring Unit.
// Requires at least 4.17 (e12f03d7031a "perf/core: Implement the
// 'perf_kprobe' PMU").
// Returns ErrNotSupported if the kernel doesn't support perf_kprobe PMU,
// or os.ErrNotExist if the given symbol does not exist in the kernel.
func pmuKprobe(symbol string, ret bool) (*perfEvent, error) {
return pmuProbe(kprobeType, symbol, "", 0, ret)
}
// pmuProbe opens a perf event based on a Performance Monitoring Unit.
//
// Requires at least a 4.17 kernel.
// e12f03d7031a "perf/core: Implement the 'perf_kprobe' PMU"
// 33ea4b24277b "perf/core: Implement the 'perf_uprobe' PMU"
//
// Returns ErrNotSupported if the kernel doesn't support perf_[k,u]probe PMU
func pmuProbe(typ probeType, symbol, path string, offset uint64, ret bool) (*perfEvent, error) {
// Getting the PMU type will fail if the kernel doesn't support
// the perf_[k,u]probe PMU.
et, err := getPMUEventType(typ)
// the perf_kprobe PMU.
et, err := getPMUEventType("kprobe")
if err != nil {
return nil, err
}
var config uint64
if ret {
bit, err := typ.RetprobeBit()
if err != nil {
return nil, err
}
config |= 1 << bit
// Create a pointer to a NUL-terminated string for the kernel.
sp, err := unsafeStringPtr(symbol)
if err != nil {
return nil, err
}
var (
attr unix.PerfEventAttr
sp unsafe.Pointer
)
switch typ {
case kprobeType:
// Create a pointer to a NUL-terminated string for the kernel.
sp, err := unsafeStringPtr(symbol)
if err != nil {
return nil, err
}
// TODO: Parse the position of the bit from /sys/bus/event_source/devices/%s/format/retprobe.
config := 0
if ret {
config = 1
}
attr = unix.PerfEventAttr{
Type: uint32(et), // PMU event type read from sysfs
Ext1: uint64(uintptr(sp)), // Kernel symbol to trace
Config: config, // Retprobe flag
}
case uprobeType:
sp, err := unsafeStringPtr(path)
if err != nil {
return nil, err
}
attr = unix.PerfEventAttr{
// The minimum size required for PMU uprobes is PERF_ATTR_SIZE_VER1,
// since it added the config2 (Ext2) field. The Size field controls the
// size of the internal buffer the kernel allocates for reading the
// perf_event_attr argument from userspace.
Size: unix.PERF_ATTR_SIZE_VER1,
Type: uint32(et), // PMU event type read from sysfs
Ext1: uint64(uintptr(sp)), // Uprobe path
Ext2: offset, // Uprobe offset
Config: config, // Retprobe flag
}
attr := unix.PerfEventAttr{
Type: uint32(et), // PMU event type read from sysfs
Ext1: uint64(uintptr(sp)), // Kernel symbol to trace
Config: uint64(config), // perf_kprobe PMU treats config as flags
}
fd, err := unix.PerfEventOpen(&attr, perfAllThreads, 0, -1, unix.PERF_FLAG_FD_CLOEXEC)
@@ -235,27 +144,22 @@ func pmuProbe(typ probeType, symbol, path string, offset uint64, ret bool) (*per
// Ensure the string pointer is not collected before PerfEventOpen returns.
runtime.KeepAlive(sp)
// Kernel has perf_[k,u]probe PMU available, initialize perf event.
// Kernel has perf_kprobe PMU available, initialize perf event.
return &perfEvent{
fd: internal.NewFD(uint32(fd)),
pmuID: et,
name: symbol,
typ: typ.PerfEventType(ret),
fd: internal.NewFD(uint32(fd)),
pmuID: et,
name: symbol,
ret: ret,
progType: ebpf.Kprobe,
}, nil
}
// tracefsKprobe creates a Kprobe tracefs entry.
func tracefsKprobe(symbol string, ret bool) (*perfEvent, error) {
return tracefsProbe(kprobeType, symbol, "", 0, ret)
}
// tracefsProbe creates a trace event by writing an entry to <tracefs>/[k,u]probe_events.
// tracefsKprobe creates a trace event by writing an entry to <tracefs>/kprobe_events.
// A new trace event group name is generated on every call to support creating
// multiple trace events for the same kernel or userspace symbol.
// Path and offset are only set in the case of uprobe(s) and are used to set
// the executable/library path on the filesystem and the offset where the probe is inserted.
// A perf event is then opened on the newly-created trace event and returned to the caller.
func tracefsProbe(typ probeType, symbol, path string, offset uint64, ret bool) (*perfEvent, error) {
// multiple trace events for the same kernel symbol. A perf event is then opened
// on the newly-created trace event and returned to the caller.
func tracefsKprobe(symbol string, ret bool) (*perfEvent, error) {
// Generate a random string for each trace event we attempt to create.
// This value is used as the 'group' token in tracefs to allow creating
// multiple kprobe trace events with the same name.
@@ -272,13 +176,14 @@ func tracefsProbe(typ probeType, symbol, path string, offset uint64, ret bool) (
if err == nil {
return nil, fmt.Errorf("trace event already exists: %s/%s", group, symbol)
}
if err != nil && !errors.Is(err, os.ErrNotExist) {
// The read is expected to fail with ErrNotSupported due to a non-existing event.
if err != nil && !errors.Is(err, ErrNotSupported) {
return nil, fmt.Errorf("checking trace event %s/%s: %w", group, symbol, err)
}
// Create the [k,u]probe trace event using tracefs.
if err := createTraceFSProbeEvent(typ, group, symbol, path, offset, ret); err != nil {
return nil, fmt.Errorf("creating probe entry on tracefs: %w", err)
// Create the kprobe trace event using tracefs.
if err := createTraceFSKprobeEvent(group, symbol, ret); err != nil {
return nil, fmt.Errorf("creating kprobe event on tracefs: %w", err)
}
// Get the newly-created trace event's id.
@@ -297,83 +202,65 @@ func tracefsProbe(typ probeType, symbol, path string, offset uint64, ret bool) (
fd: fd,
group: group,
name: symbol,
ret: ret,
tracefsID: tid,
typ: typ.PerfEventType(ret),
progType: ebpf.Kprobe, // kernel only allows attaching kprobe programs to kprobe events
}, nil
}
// createTraceFSProbeEvent creates a new ephemeral trace event by writing to
// <tracefs>/[k,u]probe_events. Returns os.ErrNotExist if symbol is not a valid
// kernel symbol, or if it is not traceable with kprobes. Returns os.ErrExist
// if a probe with the same group and symbol already exists.
func createTraceFSProbeEvent(typ probeType, group, symbol, path string, offset uint64, ret bool) error {
// createTraceFSKprobeEvent creates a new ephemeral trace event by writing to
// <tracefs>/kprobe_events. Returns ErrNotSupported if symbol is not a valid
// kernel symbol, or if it is not traceable with kprobes.
func createTraceFSKprobeEvent(group, symbol string, ret bool) error {
// Open the kprobe_events file in tracefs.
f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
f, err := os.OpenFile(kprobeEventsPath, os.O_APPEND|os.O_WRONLY, 0666)
if err != nil {
return fmt.Errorf("error opening '%s': %w", typ.EventsPath(), err)
return fmt.Errorf("error opening kprobe_events: %w", err)
}
defer f.Close()
var pe string
switch typ {
case kprobeType:
// The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt):
// p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe
// r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe
// -:[GRP/]EVENT : Clear a probe
//
// Some examples:
// r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy
// p:ebpf_5678/p_my_kprobe __x64_sys_execve
//
// Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the
// kernel default to NR_CPUS. This is desired in most eBPF cases since
// subsampling or rate limiting logic can be more accurately implemented in
// the eBPF program itself.
// See Documentation/kprobes.txt for more details.
pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(ret), group, symbol, symbol)
case uprobeType:
// The uprobe_events syntax is as follows:
// p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a probe
// r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return probe
// -:[GRP/]EVENT : Clear a probe
//
// Some examples:
// r:ebpf_1234/readline /bin/bash:0x12345
// p:ebpf_5678/main_mySymbol /bin/mybin:0x12345
//
// See Documentation/trace/uprobetracer.txt for more details.
pathOffset := uprobePathOffset(path, offset)
pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(ret), group, symbol, pathOffset)
}
// The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt):
// p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe
// r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe
// -:[GRP/]EVENT : Clear a probe
//
// Some examples:
// r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy
// p:ebpf_5678/p_my_kprobe __x64_sys_execve
//
// Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the
// kernel default to NR_CPUS. This is desired in most eBPF cases since
// subsampling or rate limiting logic can be more accurately implemented in
// the eBPF program itself. See Documentation/kprobes.txt for more details.
pe := fmt.Sprintf("%s:%s/%s %s", kprobePrefix(ret), group, symbol, symbol)
_, err = f.WriteString(pe)
// Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
// when trying to create a kretprobe for a missing symbol. Make sure ENOENT
// is returned to the caller.
if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
return fmt.Errorf("symbol %s not found: %w", symbol, os.ErrNotExist)
return fmt.Errorf("kernel symbol %s not found: %w", symbol, os.ErrNotExist)
}
if err != nil {
return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
return fmt.Errorf("writing '%s' to kprobe_events: %w", pe, err)
}
return nil
}
// closeTraceFSProbeEvent removes the [k,u]probe with the given type, group and symbol
// from <tracefs>/[k,u]probe_events.
func closeTraceFSProbeEvent(typ probeType, group, symbol string) error {
f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
// closeTraceFSKprobeEvent removes the kprobe with the given group, symbol and kind
// from <tracefs>/kprobe_events.
func closeTraceFSKprobeEvent(group, symbol string) error {
f, err := os.OpenFile(kprobeEventsPath, os.O_APPEND|os.O_WRONLY, 0666)
if err != nil {
return fmt.Errorf("error opening %s: %w", typ.EventsPath(), err)
return fmt.Errorf("error opening kprobe_events: %w", err)
}
defer f.Close()
// See [k,u]probe_events syntax above. The probe type does not need to be specified
// See kprobe_events syntax above. Kprobe type does not need to be specified
// for removals.
pe := fmt.Sprintf("-:%s/%s", group, symbol)
if _, err = f.WriteString(pe); err != nil {
return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
return fmt.Errorf("writing '%s' to kprobe_events: %w", pe, err)
}
return nil
@@ -401,38 +288,9 @@ func randomGroup(prefix string) (string, error) {
return group, nil
}
func probePrefix(ret bool) string {
func kprobePrefix(ret bool) string {
if ret {
return "r"
}
return "p"
}
// determineRetprobeBit reads a Performance Monitoring Unit's retprobe bit
// from /sys/bus/event_source/devices/<pmu>/format/retprobe.
func determineRetprobeBit(typ probeType) (uint64, error) {
p := filepath.Join("/sys/bus/event_source/devices/", typ.String(), "/format/retprobe")
data, err := ioutil.ReadFile(p)
if err != nil {
return 0, err
}
var rp uint64
n, err := fmt.Sscanf(string(bytes.TrimSpace(data)), "config:%d", &rp)
if err != nil {
return 0, fmt.Errorf("parse retprobe bit: %w", err)
}
if n != 1 {
return 0, fmt.Errorf("parse retprobe bit: expected 1 item, got %d", n)
}
return rp, nil
}
func kretprobeBit() (uint64, error) {
kprobeRetprobeBit.once.Do(func() {
kprobeRetprobeBit.value, kprobeRetprobeBit.err = determineRetprobeBit(kprobeType)
})
return kprobeRetprobeBit.value, kprobeRetprobeBit.err
}

View File

@@ -31,10 +31,6 @@ import (
// exported kernel symbols. kprobe-based (tracefs) trace events can be
// created system-wide by writing to the <tracefs>/kprobe_events file, or
// they can be scoped to the current process by creating PMU perf events.
// - u(ret)probe: Ephemeral trace events based on user provides ELF binaries
// and offsets. uprobe-based (tracefs) trace events can be
// created system-wide by writing to the <tracefs>/uprobe_events file, or
// they can be scoped to the current process by creating PMU perf events.
// - perf event: An object instantiated based on an existing trace event or
// kernel symbol. Referred to by fd in userspace.
// Exactly one eBPF program can be attached to a perf event. Multiple perf
@@ -56,16 +52,6 @@ const (
perfAllThreads = -1
)
type perfEventType uint8
const (
tracepointEvent perfEventType = iota
kprobeEvent
kretprobeEvent
uprobeEvent
uretprobeEvent
)
// A perfEvent represents a perf event kernel object. Exactly one eBPF program
// can be attached to it. It is created based on a tracefs trace event or a
// Performance Monitoring Unit (PMU).
@@ -80,10 +66,11 @@ type perfEvent struct {
// ID of the trace event read from tracefs. Valid IDs are non-zero.
tracefsID uint64
// The event type determines the types of programs that can be attached.
typ perfEventType
// True for kretprobes/uretprobes.
ret bool
fd *internal.FD
fd *internal.FD
progType ebpf.ProgramType
}
func (pe *perfEvent) isLink() {}
@@ -130,18 +117,13 @@ func (pe *perfEvent) Close() error {
return fmt.Errorf("closing perf event fd: %w", err)
}
switch pe.typ {
case kprobeEvent, kretprobeEvent:
// Clean up kprobe tracefs entry.
switch t := pe.progType; t {
case ebpf.Kprobe:
// For kprobes created using tracefs, clean up the <tracefs>/kprobe_events entry.
if pe.tracefsID != 0 {
return closeTraceFSProbeEvent(kprobeType, pe.group, pe.name)
return closeTraceFSKprobeEvent(pe.group, pe.name)
}
case uprobeEvent, uretprobeEvent:
// Clean up uprobe tracefs entry.
if pe.tracefsID != 0 {
return closeTraceFSProbeEvent(uprobeType, pe.group, pe.name)
}
case tracepointEvent:
case ebpf.TracePoint:
// Tracepoint trace events don't hold any extra resources.
return nil
}
@@ -159,21 +141,12 @@ func (pe *perfEvent) attach(prog *ebpf.Program) error {
if pe.fd == nil {
return errors.New("cannot attach to nil perf event")
}
if t := prog.Type(); t != pe.progType {
return fmt.Errorf("invalid program type (expected %s): %s", pe.progType, t)
}
if prog.FD() < 0 {
return fmt.Errorf("invalid program: %w", internal.ErrClosedFd)
}
switch pe.typ {
case kprobeEvent, kretprobeEvent, uprobeEvent, uretprobeEvent:
if t := prog.Type(); t != ebpf.Kprobe {
return fmt.Errorf("invalid program type (expected %s): %s", ebpf.Kprobe, t)
}
case tracepointEvent:
if t := prog.Type(); t != ebpf.TracePoint {
return fmt.Errorf("invalid program type (expected %s): %s", ebpf.TracePoint, t)
}
default:
return fmt.Errorf("unknown perf event type: %d", pe.typ)
}
// The ioctl below will fail when the fd is invalid.
kfd, _ := pe.fd.Value()
@@ -207,8 +180,8 @@ func unsafeStringPtr(str string) (unsafe.Pointer, error) {
// group and name must be alphanumeric or underscore, as required by the kernel.
func getTraceEventID(group, name string) (uint64, error) {
tid, err := uint64FromFile(tracefsPath, "events", group, name, "id")
if errors.Is(err, os.ErrNotExist) {
return 0, fmt.Errorf("trace event %s/%s: %w", group, name, os.ErrNotExist)
if errors.Is(err, ErrNotSupported) {
return 0, fmt.Errorf("trace event %s/%s: %w", group, name, ErrNotSupported)
}
if err != nil {
return 0, fmt.Errorf("reading trace event ID of %s/%s: %w", group, name, err)
@@ -219,22 +192,20 @@ func getTraceEventID(group, name string) (uint64, error) {
// getPMUEventType reads a Performance Monitoring Unit's type (numeric identifier)
// from /sys/bus/event_source/devices/<pmu>/type.
//
// Returns ErrNotSupported if the pmu type is not supported.
func getPMUEventType(typ probeType) (uint64, error) {
et, err := uint64FromFile("/sys/bus/event_source/devices", typ.String(), "type")
if errors.Is(err, os.ErrNotExist) {
return 0, fmt.Errorf("pmu type %s: %w", typ, ErrNotSupported)
func getPMUEventType(pmu string) (uint64, error) {
et, err := uint64FromFile("/sys/bus/event_source/devices", pmu, "type")
if errors.Is(err, ErrNotSupported) {
return 0, fmt.Errorf("pmu type %s: %w", pmu, ErrNotSupported)
}
if err != nil {
return 0, fmt.Errorf("reading pmu type %s: %w", typ, err)
return 0, fmt.Errorf("reading pmu type %s: %w", pmu, err)
}
return et, nil
}
// openTracepointPerfEvent opens a tracepoint-type perf event. System-wide
// [k,u]probes created by writing to <tracefs>/[k,u]probe_events are tracepoints
// kprobes created by writing to <tracefs>/kprobe_events are tracepoints
// behind the scenes, and can be attached to using these perf events.
func openTracepointPerfEvent(tid uint64) (*internal.FD, error) {
attr := unix.PerfEventAttr{
@@ -257,13 +228,22 @@ func openTracepointPerfEvent(tid uint64) (*internal.FD, error) {
// and joined onto base. Returns error if base no longer prefixes the path after
// joining all components.
func uint64FromFile(base string, path ...string) (uint64, error) {
// Resolve leaf path separately for error feedback. Makes the join onto
// base more readable (can't mix with variadic args).
l := filepath.Join(path...)
p := filepath.Join(base, l)
if !strings.HasPrefix(p, base) {
return 0, fmt.Errorf("path '%s' attempts to escape base path '%s': %w", l, base, errInvalidInput)
}
data, err := ioutil.ReadFile(p)
if os.IsNotExist(err) {
// Only echo leaf path, the base path can be prepended at the call site
// if more verbosity is required.
return 0, fmt.Errorf("symbol %s: %w", l, ErrNotSupported)
}
if err != nil {
return 0, fmt.Errorf("reading file %s: %w", p, err)
}

View File

@@ -1,25 +0,0 @@
package link
import (
"fmt"
"runtime"
)
func platformPrefix(symbol string) string {
prefix := runtime.GOARCH
// per https://github.com/golang/go/blob/master/src/go/build/syslist.go
switch prefix {
case "386":
prefix = "ia32"
case "amd64", "amd64p32":
prefix = "x64"
case "arm64", "arm64be":
prefix = "arm64"
default:
return symbol
}
return fmt.Sprintf("__%s_%s", prefix, symbol)
}

View File

@@ -43,7 +43,7 @@ func RawAttachProgram(opts RawAttachProgramOptions) error {
}
if err := internal.BPFProgAttach(&attr); err != nil {
return fmt.Errorf("can't attach program: %w", err)
return fmt.Errorf("can't attach program: %s", err)
}
return nil
}
@@ -69,7 +69,7 @@ func RawDetachProgram(opts RawDetachProgramOptions) error {
AttachType: uint32(opts.Attach),
}
if err := internal.BPFProgDetach(&attr); err != nil {
return fmt.Errorf("can't detach program: %w", err)
return fmt.Errorf("can't detach program: %s", err)
}
return nil

View File

@@ -11,7 +11,7 @@ import (
// tracepoints. The top-level directory is the group, the event's subdirectory
// is the name. Example:
//
// Tracepoint("syscalls", "sys_enter_fork", prog)
// Tracepoint("syscalls", "sys_enter_fork")
//
// Note that attaching eBPF programs to syscalls (sys_enter_*/sys_exit_*) is
// only possible as of kernel 4.14 (commit cf5f5ce).
@@ -44,7 +44,7 @@ func Tracepoint(group, name string, prog *ebpf.Program) (Link, error) {
tracefsID: tid,
group: group,
name: name,
typ: tracepointEvent,
progType: ebpf.TracePoint,
}
if err := pe.attach(prog); err != nil {

View File

@@ -1,207 +0,0 @@
package link
import (
"debug/elf"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"sync"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/internal"
)
var (
uprobeEventsPath = filepath.Join(tracefsPath, "uprobe_events")
// rgxUprobeSymbol is used to strip invalid characters from the uprobe symbol
// as they are not allowed to be used as the EVENT token in tracefs.
rgxUprobeSymbol = regexp.MustCompile("[^a-zA-Z0-9]+")
uprobeRetprobeBit = struct {
once sync.Once
value uint64
err error
}{}
)
// Executable defines an executable program on the filesystem.
type Executable struct {
// Path of the executable on the filesystem.
path string
// Parsed ELF symbols and dynamic symbols.
symbols map[string]elf.Symbol
}
// To open a new Executable, use:
//
// OpenExecutable("/bin/bash")
//
// The returned value can then be used to open Uprobe(s).
func OpenExecutable(path string) (*Executable, error) {
if path == "" {
return nil, fmt.Errorf("path cannot be empty")
}
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("open file '%s': %w", path, err)
}
defer f.Close()
se, err := internal.NewSafeELFFile(f)
if err != nil {
return nil, fmt.Errorf("parse ELF file: %w", err)
}
var ex = Executable{
path: path,
symbols: make(map[string]elf.Symbol),
}
if err := ex.addSymbols(se.Symbols); err != nil {
return nil, err
}
if err := ex.addSymbols(se.DynamicSymbols); err != nil {
return nil, err
}
return &ex, nil
}
func (ex *Executable) addSymbols(f func() ([]elf.Symbol, error)) error {
// elf.Symbols and elf.DynamicSymbols return ErrNoSymbols if the section is not found.
syms, err := f()
if err != nil && !errors.Is(err, elf.ErrNoSymbols) {
return err
}
for _, s := range syms {
ex.symbols[s.Name] = s
}
return nil
}
func (ex *Executable) symbol(symbol string) (*elf.Symbol, error) {
if s, ok := ex.symbols[symbol]; ok {
return &s, nil
}
return nil, fmt.Errorf("symbol %s not found", symbol)
}
// Uprobe attaches the given eBPF program to a perf event that fires when the
// given symbol starts executing in the given Executable.
// For example, /bin/bash::main():
//
// ex, _ = OpenExecutable("/bin/bash")
// ex.Uprobe("main", prog)
//
// The resulting Link must be Closed during program shutdown to avoid leaking
// system resources. Functions provided by shared libraries can currently not
// be traced and will result in an ErrNotSupported.
func (ex *Executable) Uprobe(symbol string, prog *ebpf.Program) (Link, error) {
u, err := ex.uprobe(symbol, prog, false)
if err != nil {
return nil, err
}
err = u.attach(prog)
if err != nil {
u.Close()
return nil, err
}
return u, nil
}
// Uretprobe attaches the given eBPF program to a perf event that fires right
// before the given symbol exits. For example, /bin/bash::main():
//
// ex, _ = OpenExecutable("/bin/bash")
// ex.Uretprobe("main", prog)
//
// The resulting Link must be Closed during program shutdown to avoid leaking
// system resources. Functions provided by shared libraries can currently not
// be traced and will result in an ErrNotSupported.
func (ex *Executable) Uretprobe(symbol string, prog *ebpf.Program) (Link, error) {
u, err := ex.uprobe(symbol, prog, true)
if err != nil {
return nil, err
}
err = u.attach(prog)
if err != nil {
u.Close()
return nil, err
}
return u, nil
}
// uprobe opens a perf event for the given binary/symbol and attaches prog to it.
// If ret is true, create a uretprobe.
func (ex *Executable) uprobe(symbol string, prog *ebpf.Program, ret bool) (*perfEvent, error) {
if prog == nil {
return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput)
}
if prog.Type() != ebpf.Kprobe {
return nil, fmt.Errorf("eBPF program type %s is not Kprobe: %w", prog.Type(), errInvalidInput)
}
sym, err := ex.symbol(symbol)
if err != nil {
return nil, fmt.Errorf("symbol '%s' not found in '%s': %w", symbol, ex.path, err)
}
// Symbols with location 0 from section undef are shared library calls and
// are relocated before the binary is executed. Dynamic linking is not
// implemented by the library, so mark this as unsupported for now.
if sym.Section == elf.SHN_UNDEF && sym.Value == 0 {
return nil, fmt.Errorf("cannot resolve %s library call '%s': %w", ex.path, symbol, ErrNotSupported)
}
// Use uprobe PMU if the kernel has it available.
tp, err := pmuUprobe(sym.Name, ex.path, sym.Value, ret)
if err == nil {
return tp, nil
}
if err != nil && !errors.Is(err, ErrNotSupported) {
return nil, fmt.Errorf("creating perf_uprobe PMU: %w", err)
}
// Use tracefs if uprobe PMU is missing.
tp, err = tracefsUprobe(uprobeSanitizedSymbol(sym.Name), ex.path, sym.Value, ret)
if err != nil {
return nil, fmt.Errorf("creating trace event '%s:%s' in tracefs: %w", ex.path, symbol, err)
}
return tp, nil
}
// pmuUprobe opens a perf event based on the uprobe PMU.
func pmuUprobe(symbol, path string, offset uint64, ret bool) (*perfEvent, error) {
return pmuProbe(uprobeType, symbol, path, offset, ret)
}
// tracefsUprobe creates a Uprobe tracefs entry.
func tracefsUprobe(symbol, path string, offset uint64, ret bool) (*perfEvent, error) {
return tracefsProbe(uprobeType, symbol, path, offset, ret)
}
// uprobeSanitizedSymbol replaces every invalid characted for the tracefs api with an underscore.
func uprobeSanitizedSymbol(symbol string) string {
return rgxUprobeSymbol.ReplaceAllString(symbol, "_")
}
// uprobePathOffset creates the PATH:OFFSET token for the tracefs api.
func uprobePathOffset(path string, offset uint64) string {
return fmt.Sprintf("%s:%#x", path, offset)
}
func uretprobeBit() (uint64, error) {
uprobeRetprobeBit.once.Do(func() {
uprobeRetprobeBit.value, uprobeRetprobeBit.err = determineRetprobeBit(uprobeType)
})
return uprobeRetprobeBit.value, uprobeRetprobeBit.err
}

View File

@@ -108,16 +108,12 @@ func fixupJumpsAndCalls(insns asm.Instructions) error {
offset := iter.Offset
ins := iter.Ins
if ins.Reference == "" {
continue
}
switch {
case ins.IsFunctionCall() && ins.Constant == -1:
// Rewrite bpf to bpf call
callOffset, ok := symbolOffsets[ins.Reference]
if !ok {
return fmt.Errorf("call at %d: reference to missing symbol %q", i, ins.Reference)
return fmt.Errorf("instruction %d: reference to missing symbol %q", i, ins.Reference)
}
ins.Constant = int64(callOffset - offset - 1)
@@ -126,13 +122,10 @@ func fixupJumpsAndCalls(insns asm.Instructions) error {
// Rewrite jump to label
jumpOffset, ok := symbolOffsets[ins.Reference]
if !ok {
return fmt.Errorf("jump at %d: reference to missing symbol %q", i, ins.Reference)
return fmt.Errorf("instruction %d: reference to missing symbol %q", i, ins.Reference)
}
ins.Offset = int16(jumpOffset - offset - 1)
case ins.IsLoadFromMap() && ins.MapPtr() == -1:
return fmt.Errorf("map %s: %w", ins.Reference, errUnsatisfiedReference)
}
}

50
vendor/github.com/cilium/ebpf/map.go generated vendored
View File

@@ -18,7 +18,6 @@ var (
ErrKeyNotExist = errors.New("key does not exist")
ErrKeyExist = errors.New("key already exists")
ErrIterationAborted = errors.New("iteration aborted")
ErrMapIncompatible = errors.New("map's spec is incompatible with pinned map")
)
// MapOptions control loading a map into the kernel.
@@ -88,23 +87,6 @@ func (ms *MapSpec) Copy() *MapSpec {
return &cpy
}
func (ms *MapSpec) clampPerfEventArraySize() error {
if ms.Type != PerfEventArray {
return nil
}
n, err := internal.PossibleCPUs()
if err != nil {
return fmt.Errorf("perf event array: %w", err)
}
if n := uint32(n); ms.MaxEntries > n {
ms.MaxEntries = n
}
return nil
}
// MapKV is used to initialize the contents of a Map.
type MapKV struct {
Key interface{}
@@ -114,19 +96,19 @@ type MapKV struct {
func (ms *MapSpec) checkCompatibility(m *Map) error {
switch {
case m.typ != ms.Type:
return fmt.Errorf("expected type %v, got %v: %w", ms.Type, m.typ, ErrMapIncompatible)
return fmt.Errorf("expected type %v, got %v", ms.Type, m.typ)
case m.keySize != ms.KeySize:
return fmt.Errorf("expected key size %v, got %v: %w", ms.KeySize, m.keySize, ErrMapIncompatible)
return fmt.Errorf("expected key size %v, got %v", ms.KeySize, m.keySize)
case m.valueSize != ms.ValueSize:
return fmt.Errorf("expected value size %v, got %v: %w", ms.ValueSize, m.valueSize, ErrMapIncompatible)
return fmt.Errorf("expected value size %v, got %v", ms.ValueSize, m.valueSize)
case m.maxEntries != ms.MaxEntries:
return fmt.Errorf("expected max entries %v, got %v: %w", ms.MaxEntries, m.maxEntries, ErrMapIncompatible)
return fmt.Errorf("expected max entries %v, got %v", ms.MaxEntries, m.maxEntries)
case m.flags != ms.Flags:
return fmt.Errorf("expected flags %v, got %v: %w", ms.Flags, m.flags, ErrMapIncompatible)
return fmt.Errorf("expected flags %v, got %v", ms.Flags, m.flags)
}
return nil
}
@@ -189,16 +171,14 @@ func NewMap(spec *MapSpec) (*Map, error) {
// The caller is responsible for ensuring the process' rlimit is set
// sufficiently high for locking memory during map creation. This can be done
// by calling unix.Setrlimit with unix.RLIMIT_MEMLOCK prior to calling NewMapWithOptions.
//
// May return an error wrapping ErrMapIncompatible.
func NewMapWithOptions(spec *MapSpec, opts MapOptions) (*Map, error) {
handles := newHandleCache()
defer handles.close()
btfs := make(btfHandleCache)
defer btfs.close()
return newMapWithOptions(spec, opts, handles)
return newMapWithOptions(spec, opts, btfs)
}
func newMapWithOptions(spec *MapSpec, opts MapOptions, handles *handleCache) (_ *Map, err error) {
func newMapWithOptions(spec *MapSpec, opts MapOptions, btfs btfHandleCache) (_ *Map, err error) {
closeOnError := func(c io.Closer) {
if err != nil {
c.Close()
@@ -222,7 +202,7 @@ func newMapWithOptions(spec *MapSpec, opts MapOptions, handles *handleCache) (_
defer closeOnError(m)
if err := spec.checkCompatibility(m); err != nil {
return nil, fmt.Errorf("use pinned map %s: %w", spec.Name, err)
return nil, fmt.Errorf("use pinned map %s: %s", spec.Name, err)
}
return m, nil
@@ -231,7 +211,7 @@ func newMapWithOptions(spec *MapSpec, opts MapOptions, handles *handleCache) (_
// Nothing to do here
default:
return nil, fmt.Errorf("pin type %d: %w", int(spec.Pinning), ErrNotSupported)
return nil, fmt.Errorf("unsupported pin type %d", int(spec.Pinning))
}
var innerFd *internal.FD
@@ -244,7 +224,7 @@ func newMapWithOptions(spec *MapSpec, opts MapOptions, handles *handleCache) (_
return nil, errors.New("inner maps cannot be pinned")
}
template, err := createMap(spec.InnerMap, nil, opts, handles)
template, err := createMap(spec.InnerMap, nil, opts, btfs)
if err != nil {
return nil, err
}
@@ -253,7 +233,7 @@ func newMapWithOptions(spec *MapSpec, opts MapOptions, handles *handleCache) (_
innerFd = template.fd
}
m, err := createMap(spec, innerFd, opts, handles)
m, err := createMap(spec, innerFd, opts, btfs)
if err != nil {
return nil, err
}
@@ -269,7 +249,7 @@ func newMapWithOptions(spec *MapSpec, opts MapOptions, handles *handleCache) (_
return m, nil
}
func createMap(spec *MapSpec, inner *internal.FD, opts MapOptions, handles *handleCache) (_ *Map, err error) {
func createMap(spec *MapSpec, inner *internal.FD, opts MapOptions, btfs btfHandleCache) (_ *Map, err error) {
closeOnError := func(closer io.Closer) {
if err != nil {
closer.Close()
@@ -340,7 +320,7 @@ func createMap(spec *MapSpec, inner *internal.FD, opts MapOptions, handles *hand
var btfDisabled bool
if spec.BTF != nil {
handle, err := handles.btfHandle(btf.MapSpec(spec.BTF))
handle, err := btfs.load(btf.MapSpec(spec.BTF))
btfDisabled = errors.Is(err, btf.ErrNotSupported)
if err != nil && !btfDisabled {
return nil, fmt.Errorf("load BTF: %w", err)

149
vendor/github.com/cilium/ebpf/prog.go generated vendored
View File

@@ -5,7 +5,6 @@ import (
"encoding/binary"
"errors"
"fmt"
"io"
"math"
"path/filepath"
"strings"
@@ -20,8 +19,6 @@ import (
// ErrNotSupported is returned whenever the kernel doesn't support a feature.
var ErrNotSupported = internal.ErrNotSupported
var errUnsatisfiedReference = errors.New("unsatisfied reference")
// ProgramID represents the unique ID of an eBPF program.
type ProgramID uint32
@@ -44,12 +41,6 @@ type ProgramOptions struct {
// Controls the output buffer size for the verifier. Defaults to
// DefaultVerifierLogSize.
LogSize int
// An ELF containing the target BTF for this program. It is used both to
// find the correct function to trace and to apply CO-RE relocations.
// This is useful in environments where the kernel BTF is not available
// (containers) or where it is in a non-standard location. Defaults to
// use the kernel BTF from a well-known location.
TargetBTF io.ReaderAt
}
// ProgramSpec defines a Program.
@@ -134,21 +125,21 @@ func NewProgram(spec *ProgramSpec) (*Program, error) {
// Loading a program for the first time will perform
// feature detection by loading small, temporary programs.
func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) {
handles := newHandleCache()
defer handles.close()
btfs := make(btfHandleCache)
defer btfs.close()
prog, err := newProgramWithOptions(spec, opts, handles)
if errors.Is(err, errUnsatisfiedReference) {
return nil, fmt.Errorf("cannot load program without loading its whole collection: %w", err)
}
return prog, err
return newProgramWithOptions(spec, opts, btfs)
}
func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *handleCache) (*Program, error) {
func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, btfs btfHandleCache) (*Program, error) {
if len(spec.Instructions) == 0 {
return nil, errors.New("Instructions cannot be empty")
}
if len(spec.License) == 0 {
return nil, errors.New("License cannot be empty")
}
if spec.ByteOrder != nil && spec.ByteOrder != internal.NativeEndian {
return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian)
}
@@ -166,10 +157,27 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *hand
kv = v.Kernel()
}
insns := make(asm.Instructions, len(spec.Instructions))
copy(insns, spec.Instructions)
if err := fixupJumpsAndCalls(insns); err != nil {
return nil, err
}
buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
err := insns.Marshal(buf, internal.NativeEndian)
if err != nil {
return nil, err
}
bytecode := buf.Bytes()
insCount := uint32(len(bytecode) / asm.InstructionSize)
attr := &bpfProgLoadAttr{
progType: spec.Type,
progFlags: spec.Flags,
expectedAttachType: spec.AttachType,
insCount: insCount,
instructions: internal.NewSlicePointer(bytecode),
license: internal.NewStringPointer(spec.License),
kernelVersion: kv,
}
@@ -178,24 +186,15 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *hand
attr.progName = newBPFObjName(spec.Name)
}
var err error
var targetBTF *btf.Spec
if opts.TargetBTF != nil {
targetBTF, err = handles.btfSpec(opts.TargetBTF)
if err != nil {
return nil, fmt.Errorf("load target BTF: %w", err)
}
}
var btfDisabled bool
var core btf.COREFixups
if spec.BTF != nil {
core, err = btf.ProgramFixups(spec.BTF, targetBTF)
if err != nil {
return nil, fmt.Errorf("CO-RE relocations: %w", err)
if relos, err := btf.ProgramRelocations(spec.BTF, nil); err != nil {
return nil, fmt.Errorf("CO-RE relocations: %s", err)
} else if len(relos) > 0 {
return nil, fmt.Errorf("applying CO-RE relocations: %w", ErrNotSupported)
}
handle, err := handles.btfHandle(btf.ProgramSpec(spec.BTF))
handle, err := btfs.load(btf.ProgramSpec(spec.BTF))
btfDisabled = errors.Is(err, btf.ErrNotSupported)
if err != nil && !btfDisabled {
return nil, fmt.Errorf("load BTF: %w", err)
@@ -222,27 +221,8 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *hand
}
}
insns, err := core.Apply(spec.Instructions)
if err != nil {
return nil, fmt.Errorf("CO-RE fixup: %w", err)
}
if err := fixupJumpsAndCalls(insns); err != nil {
return nil, err
}
buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
err = insns.Marshal(buf, internal.NativeEndian)
if err != nil {
return nil, err
}
bytecode := buf.Bytes()
attr.instructions = internal.NewSlicePointer(bytecode)
attr.insCount = uint32(len(bytecode) / asm.InstructionSize)
if spec.AttachTo != "" {
target, err := resolveBTFType(targetBTF, spec.AttachTo, spec.Type, spec.AttachType)
target, err := resolveBTFType(spec.AttachTo, spec.Type, spec.AttachType)
if err != nil {
return nil, err
}
@@ -270,7 +250,7 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *hand
}
logErr := err
if opts.LogLevel == 0 && opts.LogSize >= 0 {
if opts.LogLevel == 0 {
// Re-run with the verifier enabled to get better error messages.
logBuf = make([]byte, logSize)
attr.logLevel = 1
@@ -684,45 +664,52 @@ func (p *Program) ID() (ProgramID, error) {
return ProgramID(info.id), nil
}
func resolveBTFType(kernel *btf.Spec, name string, progType ProgramType, attachType AttachType) (btf.Type, error) {
func findKernelType(name string, typ btf.Type) error {
kernel, err := btf.LoadKernelSpec()
if err != nil {
return fmt.Errorf("can't load kernel spec: %w", err)
}
return kernel.FindType(name, typ)
}
func resolveBTFType(name string, progType ProgramType, attachType AttachType) (btf.Type, error) {
type match struct {
p ProgramType
a AttachType
}
var target btf.Type
var typeName, featureName string
switch (match{progType, attachType}) {
target := match{progType, attachType}
switch target {
case match{LSM, AttachLSMMac}:
target = new(btf.Func)
typeName = "bpf_lsm_" + name
featureName = name + " LSM hook"
var target btf.Func
err := findKernelType("bpf_lsm_"+name, &target)
if errors.Is(err, btf.ErrNotFound) {
return nil, &internal.UnsupportedFeatureError{
Name: name + " LSM hook",
}
}
if err != nil {
return nil, fmt.Errorf("resolve BTF for LSM hook %s: %w", name, err)
}
return &target, nil
case match{Tracing, AttachTraceIter}:
target = new(btf.Func)
typeName = "bpf_iter_" + name
featureName = name + " iterator"
var target btf.Func
err := findKernelType("bpf_iter_"+name, &target)
if errors.Is(err, btf.ErrNotFound) {
return nil, &internal.UnsupportedFeatureError{
Name: name + " iterator",
}
}
if err != nil {
return nil, fmt.Errorf("resolve BTF for iterator %s: %w", name, err)
}
return &target, nil
default:
return nil, nil
}
if kernel == nil {
var err error
kernel, err = btf.LoadKernelSpec()
if err != nil {
return nil, fmt.Errorf("load kernel spec: %w", err)
}
}
err := kernel.FindType(typeName, target)
if errors.Is(err, btf.ErrNotFound) {
return nil, &internal.UnsupportedFeatureError{
Name: featureName,
}
}
if err != nil {
return nil, fmt.Errorf("resolve BTF for %s: %w", featureName, err)
}
return target, nil
}

View File

@@ -1,95 +1,56 @@
#!/bin/bash
# Test the current package under a different kernel.
# Requires virtme and qemu to be installed.
# Examples:
# Run all tests on a 5.4 kernel
# $ ./run-tests.sh 5.4
# Run a subset of tests:
# $ ./run-tests.sh 5.4 go test ./link
set -euo pipefail
set -eu
set -o pipefail
script="$(realpath "$0")"
readonly script
# This script is a bit like a Matryoshka doll since it keeps re-executing itself
# in various different contexts:
#
# 1. invoked by the user like run-tests.sh 5.4
# 2. invoked by go test like run-tests.sh --exec-vm
# 3. invoked by init in the vm like run-tests.sh --exec-test
#
# This allows us to use all available CPU on the host machine to compile our
# code, and then only use the VM to execute the test. This is because the VM
# is usually slower at compiling than the host.
if [[ "${1:-}" = "--exec-vm" ]]; then
shift
input="$1"
shift
# Use sudo if /dev/kvm isn't accessible by the current user.
sudo=""
if [[ ! -r /dev/kvm || ! -w /dev/kvm ]]; then
sudo="sudo"
fi
readonly sudo
testdir="$(dirname "$1")"
output="$(mktemp -d)"
printf -v cmd "%q " "$@"
if [[ "$(stat -c '%t:%T' -L /proc/$$/fd/0)" == "1:3" ]]; then
# stdin is /dev/null, which doesn't play well with qemu. Use a fifo as a
# blocking substitute.
mkfifo "${output}/fake-stdin"
# Open for reading and writing to avoid blocking.
exec 0<> "${output}/fake-stdin"
rm "${output}/fake-stdin"
fi
$sudo virtme-run --kimg "${input}/bzImage" --memory 768M --pwd \
--rwdir="${testdir}=${testdir}" \
--rodir=/run/input="${input}" \
--rwdir=/run/output="${output}" \
--script-sh "PATH=\"$PATH\" \"$script\" --exec-test $cmd" \
--qemu-opts -smp 2 # need at least two CPUs for some tests
if [[ ! -e "${output}/success" ]]; then
exit 1
fi
$sudo rm -r "$output"
exit 0
elif [[ "${1:-}" = "--exec-test" ]]; then
if [[ "${1:-}" = "--in-vm" ]]; then
shift
mount -t bpf bpf /sys/fs/bpf
mount -t tracefs tracefs /sys/kernel/debug/tracing
export CGO_ENABLED=0
export GOFLAGS=-mod=readonly
export GOPATH=/run/go-path
export GOPROXY=file:///run/go-path/pkg/mod/cache/download
export GOSUMDB=off
export GOCACHE=/run/go-cache
if [[ -d "/run/input/bpf" ]]; then
export KERNEL_SELFTESTS="/run/input/bpf"
fi
dmesg -C
if ! "$@"; then
dmesg
exit 1
fi
touch "/run/output/success"
readonly output="${1}"
shift
echo Running tests...
go test -v -coverpkg=./... -coverprofile="$output/coverage.txt" -count 1 ./...
touch "$output/success"
exit 0
fi
# Pull all dependencies, so that we can run tests without the
# vm having network access.
go mod download
# Use sudo if /dev/kvm isn't accessible by the current user.
sudo=""
if [[ ! -r /dev/kvm || ! -w /dev/kvm ]]; then
sudo="sudo"
fi
readonly sudo
readonly kernel_version="${1:-}"
if [[ -z "${kernel_version}" ]]; then
echo "Expecting kernel version as first argument"
exit 1
fi
shift
readonly kernel="linux-${kernel_version}.bz"
readonly selftests="linux-${kernel_version}-selftests-bpf.bz"
readonly input="$(mktemp -d)"
readonly output="$(mktemp -d)"
readonly tmp_dir="${TMPDIR:-/tmp}"
readonly branch="${BRANCH:-master}"
@@ -99,7 +60,6 @@ fetch() {
}
fetch "${kernel}"
cp "${tmp_dir}/${kernel}" "${input}/bzImage"
if fetch "${selftests}"; then
mkdir "${input}/bpf"
@@ -108,16 +68,25 @@ else
echo "No selftests found, disabling"
fi
args=(-v -short -coverpkg=./... -coverprofile=coverage.out -count 1 ./...)
if (( $# > 0 )); then
args=("$@")
echo Testing on "${kernel_version}"
$sudo virtme-run --kimg "${tmp_dir}/${kernel}" --memory 512M --pwd \
--rw \
--rwdir=/run/input="${input}" \
--rwdir=/run/output="${output}" \
--rodir=/run/go-path="$(go env GOPATH)" \
--rwdir=/run/go-cache="$(go env GOCACHE)" \
--script-sh "PATH=\"$PATH\" $(realpath "$0") --in-vm /run/output" \
--qemu-opts -smp 2 # need at least two CPUs for some tests
if [[ ! -e "${output}/success" ]]; then
echo "Test failed on ${kernel_version}"
exit 1
else
echo "Test successful on ${kernel_version}"
if [[ -v COVERALLS_TOKEN ]]; then
goveralls -coverprofile="${output}/coverage.txt" -service=semaphore -repotoken "$COVERALLS_TOKEN"
fi
fi
export GOFLAGS=-mod=readonly
export CGO_ENABLED=0
echo Testing on "${kernel_version}"
go test -exec "$script --exec-vm $input" "${args[@]}"
echo "Test successful on ${kernel_version}"
rm -r "${input}"
$sudo rm -r "${input}"
$sudo rm -r "${output}"

View File

@@ -1,16 +0,0 @@
package apparmor
import "errors"
var (
// IsEnabled returns true if apparmor is enabled for the host.
IsEnabled = isEnabled
// ApplyProfile will apply the profile with the specified name to the process after
// the next exec. It is only supported on Linux and produces an ErrApparmorNotEnabled
// on other platforms.
ApplyProfile = applyProfile
// ErrApparmorNotEnabled indicates that AppArmor is not enabled or not supported.
ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
)

View File

@@ -15,8 +15,8 @@ var (
checkAppArmor sync.Once
)
// isEnabled returns true if apparmor is enabled for the host.
func isEnabled() bool {
// IsEnabled returns true if apparmor is enabled for the host.
func IsEnabled() bool {
checkAppArmor.Do(func() {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
@@ -57,10 +57,9 @@ func changeOnExec(name string) error {
return nil
}
// applyProfile will apply the profile with the specified name to the process after
// the next exec. It is only supported on Linux and produces an error on other
// platforms.
func applyProfile(name string) error {
// ApplyProfile will apply the profile with the specified name to the process after
// the next exec.
func ApplyProfile(name string) error {
if name == "" {
return nil
}

View File

@@ -2,11 +2,17 @@
package apparmor
func isEnabled() bool {
import (
"errors"
)
var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
func IsEnabled() bool {
return false
}
func applyProfile(name string) error {
func ApplyProfile(name string) error {
if name != "" {
return ErrApparmorNotEnabled
}

View File

@@ -258,9 +258,9 @@ func (e *Emulator) Apply(rule devices.Rule) error {
if rule.Allow {
return e.allow(innerRule)
} else {
return e.deny(innerRule)
}
return e.deny(innerRule)
}
// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
@@ -371,12 +371,3 @@ func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
}
return transitionRules, nil
}
// Rules returns the minimum set of rules necessary to convert a *deny-all*
// cgroup to the emulated filter state (note that this is not the same as a
// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
// wrapper around Transition() with the source emulator being an empty cgroup.
func (e *Emulator) Rules() ([]*devices.Rule, error) {
defaultCgroup := &Emulator{defaultAllow: false}
return defaultCgroup.Transition(e)
}

View File

@@ -11,7 +11,6 @@ import (
"strconv"
"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
@@ -23,44 +22,11 @@ const (
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// Generate the minimum ruleset for the device rules we are given. While we
// don't care about minimum transitions in cgroupv2, using the emulator
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(devicesemulator.Emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err
}
}
cleanRules, err := emu.Rules()
if err != nil {
return nil, "", err
}
p := &program{
defaultAllow: emu.IsBlacklist(),
}
func DeviceFilter(devices []*devices.Rule) (asm.Instructions, string, error) {
p := &program{}
p.init()
for idx, rule := range cleanRules {
if rule.Type == devices.WildcardDevice {
// We can safely skip over wildcard entries because there should
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action, the emulator removes those.
return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
for i := len(devices) - 1; i >= 0; i-- {
if err := p.appendDevice(devices[i]); err != nil {
return nil, "", err
}
}
@@ -69,9 +35,9 @@ func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
}
type program struct {
insts asm.Instructions
defaultAllow bool
blockID int
insts asm.Instructions
hasWildCard bool
blockID int
}
func (p *program) init() {
@@ -101,35 +67,39 @@ func (p *program) init() {
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}
// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
// to the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
func (p *program) appendRule(rule *devices.Rule) error {
// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
func (p *program) appendDevice(dev *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
if p.hasWildCard {
// All entries after wildcard entry are ignored
return nil
}
var bpfType int32
switch rule.Type {
case devices.CharDevice:
bpfType := int32(-1)
hasType := true
switch dev.Type {
case 'c':
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case devices.BlockDevice:
case 'b':
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
case 'a':
hasType = false
default:
// We do not permit 'a', nor any other types we don't know about.
return errors.Errorf("invalid type %q", string(rule.Type))
// if not specified in OCI json, typ is set to DeviceTypeAll
return errors.Errorf("invalid Type %q", string(dev.Type))
}
if rule.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", rule.Major)
if dev.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", dev.Major)
}
if rule.Minor > math.MaxUint32 {
return errors.Errorf("invalid minor %d", rule.Major)
if dev.Minor > math.MaxUint32 {
return errors.Errorf("invalid minor %d", dev.Major)
}
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := dev.Minor >= 0
bpfAccess := int32(0)
for _, r := range rule.Permissions {
for _, r := range dev.Permissions {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
@@ -149,10 +119,12 @@ func (p *program) appendRule(rule *devices.Rule) error {
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
if hasType {
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
}
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
@@ -164,16 +136,19 @@ func (p *program) appendRule(rule *devices.Rule) error {
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym),
)
}
p.insts = append(p.insts, acceptBlock(rule.Allow)...)
if !hasType && !hasAccess && !hasMajor && !hasMinor {
p.hasWildCard = true
}
p.insts = append(p.insts, acceptBlock(dev.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
p.blockID++
@@ -181,14 +156,14 @@ func (p *program) appendRule(rule *devices.Rule) error {
}
func (p *program) finalize() (asm.Instructions, error) {
var v int32
if p.defaultAllow {
v = 1
if p.hasWildCard {
// acceptBlock with asm.Return() is already inserted
return p.insts, nil
}
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- v
asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
// R0 <- 0
asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
asm.Return(),
)
p.blockID = -1
@@ -196,7 +171,7 @@ func (p *program) finalize() (asm.Instructions, error) {
}
func acceptBlock(accept bool) asm.Instructions {
var v int32
v := int32(0)
if accept {
v = 1
}

View File

@@ -0,0 +1,57 @@
package ebpf
import (
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) {
nilCloser := func() error {
return nil
}
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFD,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI,
})
if err != nil {
return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFD,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE)")
}
return nil
}
return closer, nil
}

View File

@@ -1,240 +0,0 @@
package ebpf
import (
"fmt"
"os"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func nilCloser() error {
return nil
}
func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
QueryType uint32
AttachFlags uint32
ProgIds uint64 // __aligned_u64
ProgCnt uint32
}
// Currently you can only have 64 eBPF programs attached to a cgroup.
size := 64
retries := 0
for retries < 10 {
progIds := make([]uint32, size)
query := bpfAttrQuery{
TargetFd: uint32(dirFd),
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
ProgCnt: uint32(len(progIds)),
}
// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
}
// Convert the ids to program handles.
progIds = progIds[:size]
programs := make([]*ebpf.Program, len(progIds))
for idx, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
if err != nil {
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs[idx] = program
}
runtime.KeepAlive(progIds)
return programs, nil
}
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
var (
haveBpfProgReplaceBool bool
haveBpfProgReplaceOnce sync.Once
)
// Loosely based on the BPF_F_REPLACE support check in
// <https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go>.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
return
}
defer prog.Close()
devnull, err := os.Open("/dev/null")
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
return
}
defer devnull.Close()
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags.
Target: int(devnull.Fd()),
// Attempt to "replace" bad fds with this program.
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
})
if errors.Is(err, unix.EINVAL) {
// not supported
return
}
// attach_flags test succeded.
if !errors.Is(err, unix.EBADF) {
logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}
haveBpfProgReplaceBool = true
})
return haveBpfProgReplaceBool
}
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
// Get the list of existing programs.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return nilCloser, err
}
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
// If there is only one old program, we can just replace it directly.
var (
replaceProg *ebpf.Program
attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
)
if useReplaceProg {
replaceProg = oldProgs[0]
attachFlags |= unix.BPF_F_REPLACE
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Replace: replaceProg,
Attach: ebpf.AttachCGroupDevice,
Flags: attachFlags,
})
if err != nil {
return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
// TODO: Should we attach the old filters back in this case? Otherwise
// we fail-open on a security feature, which is a bit scary.
return nil
}
if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
}
}
return closer, nil
}

View File

@@ -6,17 +6,15 @@ import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type BlkioGroup struct {
weightFilename string
weightDeviceFilename string
}
func (s *BlkioGroup) Name() string {
@@ -28,47 +26,42 @@ func (s *BlkioGroup) Apply(path string, d *cgroupData) error {
}
func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
s.detectWeightFilenames(path)
if r.BlkioWeight != 0 {
if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
}
if r.BlkioLeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range r.BlkioWeightDevice {
if wd.Weight != 0 {
if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil {
return err
}
if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
if wd.LeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
if err := fscommon.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
if err := fscommon.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
@@ -113,7 +106,7 @@ func splitBlkioStatLine(r rune) bool {
func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := cgroups.OpenFile(dir, file, os.O_RDONLY)
f, err := fscommon.OpenFile(dir, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
@@ -168,7 +161,7 @@ func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
filename string
blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
}
bfqDebugStats := []blkioStatInfo{
var bfqDebugStats = []blkioStatInfo{
{
filename: "blkio.bfq.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
@@ -202,7 +195,7 @@ func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
bfqStats := []blkioStatInfo{
var bfqStats = []blkioStatInfo{
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
@@ -212,7 +205,7 @@ func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
cfqStats := []blkioStatInfo{
var cfqStats = []blkioStatInfo{
{
filename: "blkio.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
@@ -246,7 +239,7 @@ func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
throttleRecursiveStats := []blkioStatInfo{
var throttleRecursiveStats = []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
@@ -256,7 +249,7 @@ func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
baseStats := []blkioStatInfo{
var baseStats = []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
@@ -266,7 +259,7 @@ func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
orderedStats := [][]blkioStatInfo{
var orderedStats = [][]blkioStatInfo{
bfqDebugStats,
bfqStats,
cfqStats,
@@ -287,7 +280,7 @@ func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
return err
}
*statInfo.blkioStatEntriesPtr = blkioStats
// finish if all stats are gathered
//finish if all stats are gathered
if i == len(statGroup)-1 {
return nil
}
@@ -295,17 +288,3 @@ func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
}
return nil
}
func (s *BlkioGroup) detectWeightFilenames(path string) {
if s.weightFilename != "" {
// Already detected.
return
}
if cgroups.PathExists(filepath.Join(path, "blkio.weight")) {
s.weightFilename = "blkio.weight"
s.weightDeviceFilename = "blkio.weight_device"
} else {
s.weightFilename = "blkio.bfq.weight"
s.weightDeviceFilename = "blkio.bfq.weight_device"
}
}

View File

@@ -13,7 +13,8 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type CpuGroup struct{}
type CpuGroup struct {
}
func (s *CpuGroup) Name() string {
return "cpu"
@@ -25,7 +26,7 @@ func (s *CpuGroup) Apply(path string, d *cgroupData) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// We should set the real-Time group scheduling settings before moving
@@ -41,12 +42,12 @@ func (s *CpuGroup) Apply(path string, d *cgroupData) error {
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
if r.CpuRtPeriod != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
return err
}
}
if r.CpuRtRuntime != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
return err
}
}
@@ -56,7 +57,7 @@ func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
func (s *CpuGroup) Set(path string, r *configs.Resources) error {
if r.CpuShares != 0 {
shares := r.CpuShares
if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
return err
}
// read it back
@@ -72,12 +73,12 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
}
}
if r.CpuPeriod != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(r.CpuPeriod, 10)); err != nil {
if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(r.CpuPeriod, 10)); err != nil {
return err
}
}
if r.CpuQuota != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
return err
}
}
@@ -85,7 +86,7 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
f, err := cgroups.OpenFile(path, "cpu.stat", os.O_RDONLY)
f, err := fscommon.OpenFile(path, "cpu.stat", os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil

View File

@@ -32,7 +32,8 @@ const (
clockTicks uint64 = 100
)
type CpuacctGroup struct{}
type CpuacctGroup struct {
}
func (s *CpuacctGroup) Name() string {
return "cpuacct"
@@ -90,7 +91,7 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := cgroups.ReadFile(path, cgroupCpuacctStat)
data, err := fscommon.ReadFile(path, cgroupCpuacctStat)
if err != nil {
return 0, 0, err
}
@@ -116,7 +117,7 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
func getPercpuUsage(path string) ([]uint64, error) {
percpuUsage := []uint64{}
data, err := cgroups.ReadFile(path, "cpuacct.usage_percpu")
data, err := fscommon.ReadFile(path, "cpuacct.usage_percpu")
if err != nil {
return percpuUsage, err
}
@@ -134,7 +135,7 @@ func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
file, err := cgroups.OpenFile(path, cgroupCpuacctUsageAll, os.O_RDONLY)
file, err := fscommon.OpenFile(path, cgroupCpuacctUsageAll, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {
@@ -143,7 +144,7 @@ func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
defer file.Close()
scanner := bufio.NewScanner(file)
scanner.Scan() // skipping header line
scanner.Scan() //skipping header line
for scanner.Scan() {
lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1)

View File

@@ -16,7 +16,8 @@ import (
"golang.org/x/sys/unix"
)
type CpusetGroup struct{}
type CpusetGroup struct {
}
func (s *CpusetGroup) Name() string {
return "cpuset"
@@ -28,12 +29,12 @@ func (s *CpusetGroup) Apply(path string, d *cgroupData) error {
func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
if err := fscommon.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
if err := fscommon.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}
@@ -155,7 +156,7 @@ func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error
if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
return err
}
if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) {
if err := os.Mkdir(dir, 0755); err != nil && !os.IsExist(err) {
return err
}
// We didn't inherit cpuset configs from parent, but we have
@@ -175,10 +176,10 @@ func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error
}
func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil {
if cpus, err = fscommon.ReadFile(parent, "cpuset.cpus"); err != nil {
return
}
if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil {
if mems, err = fscommon.ReadFile(parent, "cpuset.mems"); err != nil {
return
}
return cpus, mems, nil
@@ -205,7 +206,7 @@ func cpusetEnsureParent(current string) error {
if err := cpusetEnsureParent(parent); err != nil {
return err
}
if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) {
if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) {
return err
}
return cpusetCopyIfNeeded(current, parent)
@@ -224,12 +225,12 @@ func cpusetCopyIfNeeded(current, parent string) error {
}
if isEmptyCpuset(currentCpus) {
if err := cgroups.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
return err
}
}
if isEmptyCpuset(currentMems) {
if err := cgroups.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
return err
}
}

View File

@@ -9,6 +9,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
@@ -35,7 +36,7 @@ func (s *DevicesGroup) Apply(path string, d *cgroupData) error {
}
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
list, err := fscommon.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
@@ -80,7 +81,7 @@ func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if rule.Allow {
file = "devices.allow"
}
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
if err := fscommon.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
}
}

View File

@@ -10,12 +10,14 @@ import (
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type FreezerGroup struct{}
type FreezerGroup struct {
}
func (s *FreezerGroup) Name() string {
return "freezer"
@@ -33,7 +35,7 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
// Freezing failed, and it is bad and dangerous
// to leave the cgroup in FROZEN or FREEZING
// state, so (try to) thaw it back.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
}
}()
@@ -66,11 +68,11 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
// the chances to succeed in freezing
// in case new processes keep appearing
// in the cgroup.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
time.Sleep(10 * time.Millisecond)
}
if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
if err := fscommon.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
@@ -81,7 +83,7 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
// system.
time.Sleep(10 * time.Microsecond)
}
state, err := cgroups.ReadFile(path, "freezer.state")
state, err := fscommon.ReadFile(path, "freezer.state")
if err != nil {
return err
}
@@ -102,7 +104,7 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
// Despite our best efforts, it got stuck in FREEZING.
return errors.New("unable to freeze")
case configs.Thawed:
return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
return fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
@@ -116,7 +118,7 @@ func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
for {
state, err := cgroups.ReadFile(path, "freezer.state")
state, err := fscommon.ReadFile(path, "freezer.state")
if err != nil {
// If the kernel is too old, then we just treat the freezer as
// being in an "undefined" state.

View File

@@ -64,10 +64,8 @@ func NewManager(cg *configs.Cgroup, paths map[string]string, rootless bool) cgro
}
// The absolute path to the root of the cgroup hierarchies.
var (
cgroupRootLock sync.Mutex
cgroupRoot string
)
var cgroupRootLock sync.Mutex
var cgroupRoot string
const defaultCgroupRoot = "/sys/fs/cgroup"
@@ -395,7 +393,7 @@ func join(path string, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)

View File

@@ -11,7 +11,8 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type HugetlbGroup struct{}
type HugetlbGroup struct {
}
func (s *HugetlbGroup) Name() string {
return "hugetlb"
@@ -23,7 +24,7 @@ func (s *HugetlbGroup) Apply(path string, d *cgroupData) error {
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
if err := fscommon.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}

View File

@@ -25,7 +25,8 @@ const (
cgroupMemoryMaxUsage = "memory.max_usage_in_bytes"
)
type MemoryGroup struct{}
type MemoryGroup struct {
}
func (s *MemoryGroup) Name() string {
return "memory"
@@ -40,7 +41,7 @@ func setMemory(path string, val int64) error {
return nil
}
err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
if !errors.Is(err, unix.EBUSY) {
return err
}
@@ -64,7 +65,7 @@ func setSwap(path string, val int64) error {
return nil
}
return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
return fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}
func setMemoryAndSwap(path string, r *configs.Resources) error {
@@ -117,20 +118,20 @@ func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
// ignore KernelMemory and KernelMemoryTCP
if r.MemoryReservation != 0 {
if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
return err
}
}
if r.OomKillDisable {
if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil {
if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
return nil
} else if *r.MemorySwappiness <= 100 {
if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
return err
}
} else {
@@ -142,7 +143,7 @@ func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := cgroups.OpenFile(path, "memory.stat", os.O_RDONLY)
statsFile, err := fscommon.OpenFile(path, "memory.stat", os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
@@ -199,6 +200,14 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func memoryAssigned(cgroup *configs.Cgroup) bool {
return cgroup.Resources.Memory != 0 ||
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.OomKillDisable ||
(cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
@@ -249,13 +258,12 @@ func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
)
stats := cgroups.PageUsageByNUMA{}
file, err := cgroups.OpenFile(cgroupPath, filename, os.O_RDONLY)
file, err := fscommon.OpenFile(cgroupPath, filename, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
defer file.Close()
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:

View File

@@ -6,10 +6,12 @@ import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetClsGroup struct{}
type NetClsGroup struct {
}
func (s *NetClsGroup) Name() string {
return "net_cls"
@@ -21,7 +23,7 @@ func (s *NetClsGroup) Apply(path string, d *cgroupData) error {
func (s *NetClsGroup) Set(path string, r *configs.Resources) error {
if r.NetClsClassid != 0 {
if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
return err
}
}

View File

@@ -4,10 +4,12 @@ package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetPrioGroup struct{}
type NetPrioGroup struct {
}
func (s *NetPrioGroup) Name() string {
return "net_prio"
@@ -19,7 +21,7 @@ func (s *NetPrioGroup) Apply(path string, d *cgroupData) error {
func (s *NetPrioGroup) Set(path string, r *configs.Resources) error {
for _, prioMap := range r.NetPrioIfpriomap {
if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}
}

View File

@@ -7,7 +7,8 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type PerfEventGroup struct{}
type PerfEventGroup struct {
}
func (s *PerfEventGroup) Name() string {
return "perf_event"

View File

@@ -12,7 +12,8 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type PidsGroup struct{}
type PidsGroup struct {
}
func (s *PidsGroup) Name() string {
return "pids"
@@ -31,7 +32,7 @@ func (s *PidsGroup) Set(path string, r *configs.Resources) error {
limit = strconv.FormatInt(r.PidsLimit, 10)
}
if err := cgroups.WriteFile(path, "pids.max", limit); err != nil {
if err := fscommon.WriteFile(path, "pids.max", limit); err != nil {
return err
}
}

View File

@@ -23,7 +23,7 @@ func setCpu(dirPath string, r *configs.Resources) error {
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
if err := fscommon.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
return err
}
}
@@ -40,16 +40,15 @@ func setCpu(dirPath string, r *configs.Resources) error {
period = 100000
}
str += " " + strconv.FormatUint(period, 10)
if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil {
if err := fscommon.WriteFile(dirPath, "cpu.max", str); err != nil {
return err
}
}
return nil
}
func statCpu(dirPath string, stats *cgroups.Stats) error {
f, err := cgroups.OpenFile(dirPath, "cpu.stat", os.O_RDONLY)
f, err := fscommon.OpenFile(dirPath, "cpu.stat", os.O_RDONLY)
if err != nil {
return err
}

View File

@@ -3,7 +3,7 @@
package fs2
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
@@ -17,12 +17,12 @@ func setCpuset(dirPath string, r *configs.Resources) error {
}
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
if err := fscommon.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
if err := fscommon.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}

View File

@@ -6,12 +6,12 @@ import (
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func supportedControllers() (string, error) {
return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
return fscommon.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
}
// needAnyControllers returns whether we enable some supported controllers or not,
@@ -92,7 +92,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
for i, e := range elements {
current = filepath.Join(current, e)
if i > 0 {
if err := os.Mkdir(current, 0o755); err != nil {
if err := os.Mkdir(current, 0755); err != nil {
if !os.IsExist(err) {
return err
}
@@ -105,7 +105,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
}
}()
}
cgType, _ := cgroups.ReadFile(current, cgTypeFile)
cgType, _ := fscommon.ReadFile(current, cgTypeFile)
cgType = strings.TrimSpace(cgType)
switch cgType {
// If the cgroup is in an invalid mode (usually this means there's an internal
@@ -122,7 +122,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
// since that means we're a properly delegated cgroup subtree) but in
// this case there's not much we can do and it's better than giving an
// error.
_ = cgroups.WriteFile(current, cgTypeFile, "threaded")
_ = fscommon.WriteFile(current, cgTypeFile, "threaded")
}
// If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers
// (and you cannot usually take a cgroup out of threaded mode).
@@ -136,11 +136,11 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
}
// enable all supported controllers
if i < len(elements)-1 {
if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil {
if err := fscommon.WriteFile(current, cgStCtlFile, res); err != nil {
// try write one by one
allCtrs := strings.Split(res, " ")
for _, ctr := range allCtrs {
_ = cgroups.WriteFile(current, cgStCtlFile, ctr)
_ = fscommon.WriteFile(current, cgStCtlFile, ctr)
}
}
// Some controllers might not be enabled when rootless or containerized,

View File

@@ -82,7 +82,9 @@ func parseCgroupFile(path string) (string, error) {
}
func parseCgroupFromReader(r io.Reader) (string, error) {
s := bufio.NewScanner(r)
var (
s = bufio.NewScanner(r)
)
for s.Scan() {
var (
text = s.Text()

View File

@@ -58,15 +58,29 @@ func setDevices(dirPath string, r *configs.Resources) error {
if r.SkipDevices {
return nil
}
// XXX: This is currently a white-list (but all callers pass a blacklist of
// devices). This is bad for a whole variety of reasons, but will need
// to be fixed with co-ordinated effort with downstreams.
insts, license, err := devicefilter.DeviceFilter(r.Devices)
if err != nil {
return err
}
dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0600)
if err != nil {
return errors.Errorf("cannot get dir FD for %s", dirPath)
}
defer unix.Close(dirFD)
// XXX: This code is currently incorrect when it comes to updating an
// existing cgroup with new rules (new rulesets are just appended to
// the program list because this uses BPF_F_ALLOW_MULTI). If we didn't
// use BPF_F_ALLOW_MULTI we could actually atomically swap the
// programs.
//
// The real issue is that BPF_F_ALLOW_MULTI makes it hard to have a
// race-free blacklist because it acts as a whitelist by default, and
// having a deny-everything program cannot be overridden by other
// programs. You could temporarily insert a deny-everything program
// but that would result in spurrious failures during updates.
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if !canSkipEBPFError(r) {
return err

View File

@@ -3,20 +3,27 @@
package fs2
import (
"bufio"
stdErrors "errors"
"fmt"
"os"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
func setFreezer(dirPath string, state configs.FreezerState) error {
if err := supportsFreezer(dirPath); err != nil {
// We can ignore this request as long as the user didn't ask us to
// freeze the container (since without the freezer cgroup, that's a
// no-op).
if state == configs.Undefined || state == configs.Thawed {
return nil
}
return errors.Wrap(err, "freezer not supported")
}
var stateStr string
switch state {
case configs.Undefined:
@@ -29,23 +36,11 @@ func setFreezer(dirPath string, state configs.FreezerState) error {
return errors.Errorf("invalid freezer state %q requested", state)
}
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR)
if err != nil {
// We can ignore this request as long as the user didn't ask us to
// freeze the container (since without the freezer cgroup, that's a
// no-op).
if state != configs.Frozen {
return nil
}
return errors.Wrap(err, "freezer not supported")
}
defer fd.Close()
if _, err := fd.WriteString(stateStr); err != nil {
if err := fscommon.WriteFile(dirPath, "cgroup.freeze", stateStr); err != nil {
return err
}
// Confirm that the cgroup did actually change states.
if actualState, err := readFreezer(dirPath, fd); err != nil {
if actualState, err := getFreezer(dirPath); err != nil {
return err
} else if actualState != state {
return errors.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState)
@@ -53,8 +48,13 @@ func setFreezer(dirPath string, state configs.FreezerState) error {
return nil
}
func supportsFreezer(dirPath string) error {
_, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
return err
}
func getFreezer(dirPath string) (configs.FreezerState, error) {
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY)
state, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
if err != nil {
// If the kernel is too old, then we just treat the freezer as being in
// an "undefined" state.
@@ -63,67 +63,12 @@ func getFreezer(dirPath string) (configs.FreezerState, error) {
}
return configs.Undefined, err
}
defer fd.Close()
return readFreezer(dirPath, fd)
}
func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) {
if _, err := fd.Seek(0, 0); err != nil {
return configs.Undefined, err
}
state := make([]byte, 2)
if _, err := fd.Read(state); err != nil {
return configs.Undefined, err
}
switch string(state) {
case "0\n":
switch strings.TrimSpace(state) {
case "0":
return configs.Thawed, nil
case "1\n":
return waitFrozen(dirPath)
case "1":
return configs.Frozen, nil
default:
return configs.Undefined, errors.Errorf(`unknown "cgroup.freeze" state: %q`, state)
}
}
// waitFrozen polls cgroup.events until it sees "frozen 1" in it.
func waitFrozen(dirPath string) (configs.FreezerState, error) {
fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY)
if err != nil {
return configs.Undefined, err
}
defer fd.Close()
// XXX: Simple wait/read/retry is used here. An implementation
// based on poll(2) or inotify(7) is possible, but it makes the code
// much more complicated. Maybe address this later.
const (
// Perform maxIter with waitTime in between iterations.
waitTime = 10 * time.Millisecond
maxIter = 1000
)
scanner := bufio.NewScanner(fd)
for i := 0; scanner.Scan(); {
if i == maxIter {
return configs.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter)
}
line := scanner.Text()
val := strings.TrimPrefix(line, "frozen ")
if val != line { // got prefix
if val[0] == '1' {
return configs.Frozen, nil
}
i++
// wait, then re-read
time.Sleep(waitTime)
_, err := fd.Seek(0, 0)
if err != nil {
return configs.Undefined, err
}
}
}
// Should only reach here either on read error,
// or if the file does not contain "frozen " line.
return configs.Undefined, scanner.Err()
}

View File

@@ -51,7 +51,7 @@ func (m *manager) getControllers() error {
return nil
}
data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers")
data, err := fscommon.ReadFile(m.dirPath, "cgroup.controllers")
if err != nil {
if m.rootless && m.config.Path == "" {
return nil
@@ -98,7 +98,9 @@ func (m *manager) GetAllPids() ([]int, error) {
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
var errs []error
var (
errs []error
)
st := cgroups.NewStats()
@@ -197,7 +199,7 @@ func (m *manager) setUnified(res map[string]string) error {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if err := cgroups.WriteFile(m.dirPath, k, v); err != nil {
if err := fscommon.WriteFile(m.dirPath, k, v); err != nil {
errC := errors.Cause(err)
// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
if errors.Is(errC, os.ErrPermission) || errors.Is(errC, os.ErrNotExist) {

View File

@@ -21,7 +21,7 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
return nil
}
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
if err := fscommon.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}

View File

@@ -4,95 +4,60 @@ package fs2
import (
"bufio"
"bytes"
"fmt"
"os"
"strconv"
"strings"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isIoSet(r *configs.Resources) bool {
return r.BlkioWeight != 0 ||
len(r.BlkioWeightDevice) > 0 ||
len(r.BlkioThrottleReadBpsDevice) > 0 ||
len(r.BlkioThrottleWriteBpsDevice) > 0 ||
len(r.BlkioThrottleReadIOPSDevice) > 0 ||
len(r.BlkioThrottleWriteIOPSDevice) > 0
}
// bfqDeviceWeightSupported checks for per-device BFQ weight support (added
// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
func bfqDeviceWeightSupported(bfq *os.File) bool {
if bfq == nil {
return false
}
_, _ = bfq.Seek(0, 0)
buf := make([]byte, 32)
_, _ = bfq.Read(buf)
// If only a single number (default weight) if read back, we have older kernel.
_, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64)
return err != nil
}
func setIo(dirPath string, r *configs.Resources) error {
if !isIoSet(r) {
return nil
}
// If BFQ IO scheduler is available, use it.
var bfq *os.File
if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 {
var err error
bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR)
if err == nil {
defer bfq.Close()
} else if !os.IsNotExist(err) {
return err
}
}
if r.BlkioWeight != 0 {
if bfq != nil { // Use BFQ.
if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
filename := "io.bfq.weight"
if err := fscommon.WriteFile(dirPath, filename,
strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
// if io.bfq.weight does not exist, then bfq module is not loaded.
// Fallback to use io.weight with a conversion scheme
if !os.IsNotExist(err) {
return err
}
} else {
// Fallback to io.weight with a conversion scheme.
v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight)
if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil {
if err := fscommon.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil {
return err
}
}
}
if bfqDeviceWeightSupported(bfq) {
for _, wd := range r.BlkioWeightDevice {
if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil {
return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err)
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
return err
}
}
@@ -102,7 +67,7 @@ func setIo(dirPath string, r *configs.Resources) error {
func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) {
ret := map[string][]string{}
f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY)
f, err := fscommon.OpenFile(dirPath, name, os.O_RDONLY)
if err != nil {
return nil, err
}
@@ -123,22 +88,22 @@ func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error
}
func statIo(dirPath string, stats *cgroups.Stats) error {
// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
var ioServiceBytesRecursive []cgroups.BlkioStatEntry
values, err := readCgroup2MapFile(dirPath, "io.stat")
if err != nil {
return err
}
// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
var parsedStats cgroups.BlkioStats
for k, v := range values {
d := strings.Split(k, ":")
if len(d) != 2 {
continue
}
major, err := strconv.ParseUint(d[0], 10, 64)
major, err := strconv.ParseUint(d[0], 10, 0)
if err != nil {
return err
}
minor, err := strconv.ParseUint(d[1], 10, 64)
minor, err := strconv.ParseUint(d[1], 10, 0)
if err != nil {
return err
}
@@ -150,32 +115,15 @@ func statIo(dirPath string, stats *cgroups.Stats) error {
}
op := d[0]
// Map to the cgroupv1 naming and layout (in separate tables).
var targetTable *[]cgroups.BlkioStatEntry
// Accommodate the cgroup v1 naming
switch op {
// Equivalent to cgroupv1's blkio.io_service_bytes.
case "rbytes":
op = "Read"
targetTable = &parsedStats.IoServiceBytesRecursive
op = "read"
case "wbytes":
op = "Write"
targetTable = &parsedStats.IoServiceBytesRecursive
// Equivalent to cgroupv1's blkio.io_serviced.
case "rios":
op = "Read"
targetTable = &parsedStats.IoServicedRecursive
case "wios":
op = "Write"
targetTable = &parsedStats.IoServicedRecursive
default:
// Skip over entries we cannot map to cgroupv1 stats for now.
// In the future we should expand the stats struct to include
// them.
logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item)
continue
op = "write"
}
value, err := strconv.ParseUint(d[1], 10, 64)
value, err := strconv.ParseUint(d[1], 10, 0)
if err != nil {
return err
}
@@ -186,9 +134,9 @@ func statIo(dirPath string, stats *cgroups.Stats) error {
Minor: minor,
Value: value,
}
*targetTable = append(*targetTable, entry)
ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry)
}
}
stats.BlkioStats = parsedStats
stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive}
return nil
}

View File

@@ -52,13 +52,13 @@ func setMemory(dirPath string, r *configs.Resources) error {
}
// never write empty string to `memory.swap.max`, it means set to 0.
if swapStr != "" {
if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil {
if err := fscommon.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil {
return err
}
}
if val := numToStr(r.Memory); val != "" {
if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil {
if err := fscommon.WriteFile(dirPath, "memory.max", val); err != nil {
return err
}
}
@@ -66,7 +66,7 @@ func setMemory(dirPath string, r *configs.Resources) error {
// cgroup.Resources.KernelMemory is ignored
if val := numToStr(r.MemoryReservation); val != "" {
if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil {
if err := fscommon.WriteFile(dirPath, "memory.low", val); err != nil {
return err
}
}
@@ -76,7 +76,7 @@ func setMemory(dirPath string, r *configs.Resources) error {
func statMemory(dirPath string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := cgroups.OpenFile(dirPath, "memory.stat", os.O_RDONLY)
statsFile, err := fscommon.OpenFile(dirPath, "memory.stat", os.O_RDONLY)
if err != nil {
return err
}

View File

@@ -23,7 +23,7 @@ func setPids(dirPath string, r *configs.Resources) error {
return nil
}
if val := numToStr(r.PidsLimit); val != "" {
if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil {
if err := fscommon.WriteFile(dirPath, "pids.max", val); err != nil {
return err
}
}
@@ -34,9 +34,9 @@ func setPids(dirPath string, r *configs.Resources) error {
func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
// if the controller is not enabled, let's read PIDS from cgroups.procs
// (or threads if cgroup.threads is enabled)
contents, err := cgroups.ReadFile(dirPath, "cgroup.procs")
contents, err := fscommon.ReadFile(dirPath, "cgroup.procs")
if errors.Is(err, unix.ENOTSUP) {
contents, err = cgroups.ReadFile(dirPath, "cgroup.threads")
contents, err = fscommon.ReadFile(dirPath, "cgroup.threads")
}
if err != nil {
return err

View File

@@ -0,0 +1,51 @@
// +build linux
package fscommon
import (
"bytes"
"os"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
return errors.Wrapf(err, "failed to write %q", data)
}
return nil
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
func retryingWriteFile(fd *os.File, data string) error {
for {
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
}
}

View File

@@ -1,7 +1,6 @@
package cgroups
package fscommon
import (
"bytes"
"os"
"strings"
"sync"
@@ -11,54 +10,6 @@ import (
"golang.org/x/sys/unix"
)
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, errors.Errorf("no directory specified for %s", file)
}
return openFile(dir, file, flags)
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
return errors.Wrapf(err, "failed to write %q", data)
}
return nil
}
func retryingWriteFile(fd *os.File, data string) error {
for {
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
}
}
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
@@ -77,8 +28,7 @@ var (
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH,
})
Flags: unix.O_DIRECTORY | unix.O_PATH})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS {
@@ -102,6 +52,7 @@ func prepareOpenat2() error {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
})
return prepErr
@@ -109,7 +60,10 @@ func prepareOpenat2() error {
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
func openFile(dir, file string, flags int) (*os.File, error) {
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, errors.Errorf("no directory specified for %s", file)
}
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
// "emulate" cgroup fs for unit tests

View File

@@ -8,19 +8,10 @@ import (
"math"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
var (
ErrNotValidFormat = errors.New("line is not a valid key value format")
// Deprecated: use cgroups.OpenFile instead.
OpenFile = cgroups.OpenFile
// Deprecated: use cgroups.ReadFile instead.
ReadFile = cgroups.ReadFile
// Deprecated: use cgroups.WriteFile instead.
WriteFile = cgroups.WriteFile
)
// ParseUint converts a string to an uint64 integer.
@@ -66,7 +57,7 @@ func ParseKeyValue(t string) (string, uint64, error) {
// and returns a value of the specified key. ParseUint is used for value
// conversion.
func GetValueByKey(path, file, key string) (uint64, error) {
content, err := cgroups.ReadFile(path, file)
content, err := ReadFile(path, file)
if err != nil {
return 0, err
}
@@ -104,7 +95,7 @@ func GetCgroupParamUint(path, file string) (uint64, error) {
// GetCgroupParamInt reads a single int64 value from specified cgroup file.
// If the value read is "max", the math.MaxInt64 is returned.
func GetCgroupParamInt(path, file string) (int64, error) {
contents, err := cgroups.ReadFile(path, file)
contents, err := ReadFile(path, file)
if err != nil {
return 0, err
}
@@ -122,7 +113,7 @@ func GetCgroupParamInt(path, file string) (int64, error) {
// GetCgroupParamString reads a string from the specified cgroup file.
func GetCgroupParamString(path, file string) (string, error) {
contents, err := cgroups.ReadFile(path, file)
contents, err := ReadFile(path, file)
if err != nil {
return "", err
}

View File

@@ -158,27 +158,14 @@ func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
return "", nil
}
// DeviceAllow is the dbus type "a(ss)" which means we need a struct
// to represent it in Go.
type deviceAllowEntry struct {
Path string
Perms string
}
func allowAllDevices() []systemdDbus.Property {
// Setting mode to auto and removing all DeviceAllow rules
// results in allowing access to all devices.
return []systemdDbus.Property{
newProp("DevicePolicy", "auto"),
newProp("DeviceAllow", []deviceAllowEntry{}),
}
}
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) {
if r.SkipDevices {
return nil, nil
func generateDeviceProperties(rules []*devices.Rule) ([]systemdDbus.Property, error) {
// DeviceAllow is the type "a(ss)" which means we need a temporary struct
// to represent it in Go.
type deviceAllowEntry struct {
Path string
Perms string
}
properties := []systemdDbus.Property{
@@ -190,7 +177,7 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
// Figure out the set of rules.
configEmu := &cgroupdevices.Emulator{}
for _, rule := range r.Devices {
for _, rule := range rules {
if err := configEmu.Apply(*rule); err != nil {
return nil, errors.Wrap(err, "apply rule for systemd")
}
@@ -202,7 +189,12 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
if configEmu.IsBlacklist() {
// However, if we're dealing with an allow-all rule then we can do it.
if configEmu.IsAllowAll() {
return allowAllDevices(), nil
return []systemdDbus.Property{
// Run in white-list mode by setting to "auto" and removing all
// DeviceAllow rules.
newProp("DevicePolicy", "auto"),
newProp("DeviceAllow", []deviceAllowEntry{}),
}, nil
}
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
return properties, nil
@@ -211,7 +203,8 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
finalRules, err := configEmu.Rules()
baseEmu := &cgroupdevices.Emulator{}
finalRules, err := baseEmu.Transition(configEmu)
if err != nil {
return nil, errors.Wrap(err, "get simplified rules for systemd")
}
@@ -313,7 +306,7 @@ func getUnitName(c *configs.Cgroup) string {
// isDbusError returns true if the error is a specific dbus error.
func isDbusError(err error, name string) bool {
if err != nil {
var derr dbus.Error
var derr *dbus.Error
if errors.As(err, &derr) {
return strings.Contains(derr.Name, name)
}
@@ -362,9 +355,6 @@ func stopUnit(cm *dbusConnManager, unitName string) error {
return err
})
if err == nil {
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
@@ -372,8 +362,8 @@ func stopUnit(cm *dbusConnManager, unitName string) error {
if s != "done" {
logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
}
case <-timeout.C:
return errors.New("Timed out while waiting for systemd to remove " + unitName)
case <-time.After(time.Second):
logrus.Warnf("Timed out while waiting for StopUnit(%s) completion signal from dbus. Continuing...", unitName)
}
}
return nil
@@ -486,7 +476,7 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st
}
if cpus != "" {
bits, err := RangeToBits(cpus)
bits, err := rangeToBits(cpus)
if err != nil {
return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w",
cpus, err)
@@ -495,7 +485,7 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st
newProp("AllowedCPUs", bits))
}
if mems != "" {
bits, err := RangeToBits(mems)
bits, err := rangeToBits(mems)
if err != nil {
return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w",
mems, err)

View File

@@ -5,15 +5,15 @@ import (
"strconv"
"strings"
"github.com/bits-and-blooms/bitset"
"github.com/pkg/errors"
"github.com/willf/bitset"
)
// RangeToBits converts a text representation of a CPU mask (as written to
// rangeToBits converts a text representation of a CPU mask (as written to
// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
// with the corresponding bits set (as consumed by systemd over dbus as
// AllowedCPUs/AllowedMemoryNodes unit property value).
func RangeToBits(str string) ([]byte, error) {
func rangeToBits(str string) ([]byte, error) {
bits := &bitset.BitSet{}
for _, r := range strings.Split(str, ",") {

View File

@@ -17,16 +17,14 @@ var (
dbusRootless bool
)
type dbusConnManager struct{}
type dbusConnManager struct {
}
// newDbusConnManager initializes systemd dbus connection manager.
func newDbusConnManager(rootless bool) *dbusConnManager {
dbusMu.Lock()
defer dbusMu.Unlock()
if dbusInited && rootless != dbusRootless {
panic("can't have both root and rootless dbus")
}
dbusInited = true
dbusRootless = rootless
return &dbusConnManager{}
}

View File

@@ -61,7 +61,7 @@ var legacySubsystems = []subsystem{
func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
deviceProperties, err := generateDeviceProperties(r)
deviceProperties, err := generateDeviceProperties(r.Devices)
if err != nil {
return nil, err
}
@@ -207,10 +207,9 @@ func (m *legacyManager) Destroy() error {
stopErr := stopUnit(m.dbus, getUnitName(m.cgroups))
// Both on success and on error, cleanup all the cgroups
// we are aware of, as some of them were created directly
// by Apply() and are not managed by systemd.
if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil {
// Both on success and on error, cleanup all the cgroups we are aware of.
// Some of them were created directly by Apply() and are not managed by systemd.
if err := cgroups.RemovePaths(m.paths); err != nil {
return err
}
@@ -238,7 +237,7 @@ func (m *legacyManager) joinCgroups(pid int) error {
}
default:
if path, ok := m.paths[name]; ok {
if err := os.MkdirAll(path, 0o755); err != nil {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
@@ -339,24 +338,27 @@ func (m *legacyManager) Set(r *configs.Resources) error {
return err
}
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
targetFreezerState := configs.Undefined
if !m.cgroups.SkipDevices {
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err = m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {

View File

@@ -96,7 +96,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
newProp("CPUWeight", num))
case "cpuset.cpus", "cpuset.mems":
bits, err := RangeToBits(v)
bits, err := rangeToBits(v)
if err != nil {
return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
}
@@ -172,7 +172,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
// aren't the end of the world, but it is a bit concerning. However
// it's unclear if systemd removes all eBPF programs attached when
// doing SetUnitProperties...
deviceProperties, err := generateDeviceProperties(r)
deviceProperties, err := generateDeviceProperties(r.Devices)
if err != nil {
return nil, err
}
@@ -418,24 +418,27 @@ func (m *unifiedManager) Set(r *configs.Resources) error {
return err
}
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
targetFreezerState := configs.Undefined
if !m.cgroups.SkipDevices {
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err = m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {

View File

@@ -15,6 +15,7 @@ import (
"sync"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
@@ -87,7 +88,7 @@ func GetAllSubsystems() ([]string, error) {
// - freezer: implemented in kernel 5.2
// We assume these are always available, as it is hard to detect availability.
pseudo := []string{"devices", "freezer"}
data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
if err != nil {
return nil, err
}
@@ -266,6 +267,7 @@ func RemovePaths(paths map[string]string) (err error) {
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroups existence because
@@ -374,7 +376,7 @@ func WriteCgroupProc(dir string, pid int) error {
return nil
}
file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
if err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
}

View File

@@ -13,12 +13,12 @@ const (
Thawed FreezerState = "THAWED"
)
// Cgroup holds properties of a cgroup on Linux.
type Cgroup struct {
// Name specifies the name of the cgroup
// Deprecated, use Path instead
Name string `json:"name,omitempty"`
// Parent specifies the name of parent of cgroup or slice
// name of parent of cgroup or slice
// Deprecated, use Path instead
Parent string `json:"parent,omitempty"`
// Path specifies the path to cgroups that are created and/or joined by the container.
@@ -127,8 +127,8 @@ type Resources struct {
// SkipDevices allows to skip configuring device permissions.
// Used by e.g. kubelet while creating a parent cgroup (kubepods)
// common for many containers, and by runc update.
// common for many containers.
//
// NOTE it is impossible to start a container which has this flag set.
SkipDevices bool `json:"-"`
SkipDevices bool `json:"skip_devices"`
}

View File

@@ -2,7 +2,7 @@
package configs
// Cgroup holds properties of a cgroup on Linux
// TODO Windows: This can ultimately be entirely factored out on Windows as
// cgroups are a Unix-specific construct.
type Cgroup struct{}
type Cgroup struct {
}

View File

@@ -208,11 +208,9 @@ type Config struct {
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
}
type (
HookName string
HookList []Hook
Hooks map[HookName]HookList
)
type HookName string
type HookList []Hook
type Hooks map[HookName]HookList
const (
// Prestart commands are executed after the container namespaces are created,
@@ -389,7 +387,7 @@ func (c Command) Run(s *specs.State) error {
case err := <-errC:
return err
case <-timerCh:
_ = cmd.Process.Kill()
cmd.Process.Kill()
<-errC
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}

View File

@@ -0,0 +1,17 @@
package configs
import "github.com/opencontainers/runc/libcontainer/devices"
type (
// Deprecated: use libcontainer/devices.Device
Device = devices.Device
// Deprecated: use libcontainer/devices.Rule
DeviceRule = devices.Rule
// Deprecated: use libcontainer/devices.Type
DeviceType = devices.Type
// Deprecated: use libcontainer/devices.Permissions
DevicePermissions = devices.Permissions
)

View File

@@ -3,7 +3,7 @@ package configs
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
EXT_COPYUP = 1 << iota
)
type Mount struct {

View File

@@ -4,4 +4,5 @@ package configs
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct{}
type Namespace struct {
}

View File

@@ -50,10 +50,7 @@ type Network struct {
HairpinMode bool `json:"hairpin_mode"`
}
// Route defines a routing table entry.
//
// Routes can be specified to create entries in the routing table as the container
// is started.
// Routes can be specified to create entries in the route table as the container is started
//
// All of destination, source, and gateway should be either IPv4 or IPv6.
// One of the three options must be present, and omitted entries will use their
@@ -61,15 +58,15 @@ type Network struct {
// gateway to 1.2.3.4 and the interface to eth0 will set up a standard
// destination of 0.0.0.0(or *) when viewed in the route table.
type Route struct {
// Destination specifies the destination IP address and mask in the CIDR form.
// Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6
Destination string `json:"destination"`
// Source specifies the source IP address and mask in the CIDR form.
// Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6
Source string `json:"source"`
// Gateway specifies the gateway IP address.
// Sets the gateway. Accepts IPv4 and IPv6
Gateway string `json:"gateway"`
// InterfaceName specifies the device to set this route up for, for example eth0.
// The device to set this route up for, for example: eth0
InterfaceName string `json:"interface_name"`
}

View File

@@ -12,7 +12,6 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
selinux "github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -24,13 +23,13 @@ func New() Validator {
return &ConfigValidator{}
}
type ConfigValidator struct{}
type ConfigValidator struct {
}
type check func(config *configs.Config) error
func (v *ConfigValidator) Validate(config *configs.Config) error {
checks := []check{
v.cgroups,
v.rootfs,
v.network,
v.hostname,
@@ -40,21 +39,17 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
v.sysctl,
v.intelrdt,
v.rootlessEUID,
v.mounts,
}
for _, c := range checks {
if err := c(config); err != nil {
return err
}
}
// Relaxed validation rules for backward compatibility
warns := []check{
v.mounts, // TODO (runc v1.x.x): make this an error instead of a warning
}
for _, c := range warns {
if err := c(config); err != nil {
logrus.WithError(err).Warnf("invalid configuration")
}
if err := v.cgroups(config); err != nil {
return err
}
return nil
}

View File

@@ -9,7 +9,7 @@ import (
// mount initializes the console inside the rootfs mounting with the specified mount label
// and applying the correct ownership of the console.
func mountConsole(slavePath string) error {
oldMask := unix.Umask(0o000)
oldMask := unix.Umask(0000)
defer unix.Umask(oldMask)
f, err := os.Create("/dev/console")
if err != nil && !os.IsExist(err) {

View File

@@ -437,8 +437,8 @@ func (c *linuxContainer) createExecFifo() error {
if _, err := os.Stat(fifoName); err == nil {
return fmt.Errorf("exec fifo %s already exists", fifoName)
}
oldMask := unix.Umask(0o000)
if err := unix.Mkfifo(fifoName, 0o622); err != nil {
oldMask := unix.Umask(0000)
if err := unix.Mkfifo(fifoName, 0622); err != nil {
unix.Umask(oldMask)
return err
}
@@ -699,6 +699,7 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc
var criuFeatures *criurpc.CriuFeatures
func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
t := criurpc.CriuReqType_FEATURE_CHECK
// make sure the features we are looking for are really not from
@@ -760,6 +761,7 @@ func compareCriuVersion(criuVersion int, minVersion int) error {
// checkCriuVersion checks Criu version greater than or equal to minVersion
func (c *linuxContainer) checkCriuVersion(minVersion int) error {
// If the version of criu has already been determined there is no need
// to ask criu for the version again. Use the value from c.criuVersion.
if c.criuVersion != 0 {
@@ -968,7 +970,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
// Since a container can be C/R'ed multiple times,
// the checkpoint directory may already exist.
if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
if err := os.Mkdir(criuOpts.ImagesDirectory, 0700); err != nil && !os.IsExist(err) {
return err
}
@@ -976,7 +978,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
}
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) {
return err
}
@@ -1046,7 +1048,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
}
}
// pre-dump may need parentImage param to complete iterative migration
//pre-dump may need parentImage param to complete iterative migration
if criuOpts.ParentImage != "" {
rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
rpcOpts.TrackMem = proto.Bool(true)
@@ -1144,7 +1146,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
return err
}
err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0600)
if err != nil {
return err
}
@@ -1215,7 +1217,7 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
return err
}
if err := os.MkdirAll(dest, 0o755); err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
}
@@ -1316,7 +1318,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
}
// Since a container can be C/R'ed multiple times,
// the work directory may already exist.
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
@@ -1338,7 +1340,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
// c.config.Rootfs is bind-mounted to a temporary directory
// to satisfy these requirements.
root := filepath.Join(c.root, "criu-root")
if err := os.Mkdir(root, 0o755); err != nil {
if err := os.Mkdir(root, 0755); err != nil {
return err
}
defer os.Remove(root)
@@ -1350,7 +1352,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err != nil {
return err
}
defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck
defer unix.Unmount(root, unix.MNT_DETACH)
t := criurpc.CriuReqType_RESTORE
req := &criurpc.CriuReq{
Type: &t,
@@ -1375,15 +1377,6 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
},
}
if criuOpts.LsmProfile != "" {
// CRIU older than 3.16 has a bug which breaks the possibility
// to set a different LSM profile.
if err := c.checkCriuVersion(31600); err != nil {
return errors.New("--lsm-profile requires at least CRIU 3.16")
}
req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
}
c.handleCriuConfigurationFile(req.Opts)
if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
@@ -1672,7 +1665,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
break
}
_ = criuClientCon.CloseWrite()
criuClientCon.CloseWrite()
// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
// Here we want to wait only the CRIU process.
criuProcessState, err = criuProcess.Wait()

View File

@@ -29,5 +29,4 @@ type CriuOpts struct {
AutoDedup bool // auto deduplication for incremental dumps
LazyPages bool // restore memory pages lazily using userfaultfd
StatusFd int // fd for feedback when lazy server is ready
LsmProfile string // LSM profile used to restore the container
}

View File

@@ -11,8 +11,10 @@ import (
"golang.org/x/sys/unix"
)
// ErrNotADevice denotes that a file is not a valid linux device.
var ErrNotADevice = errors.New("not a device node")
var (
// ErrNotADevice denotes that a file is not a valid linux device.
ErrNotADevice = errors.New("not a device node")
)
// Testing dependencies
var (
@@ -27,9 +29,8 @@ func mkDev(d *Rule) (uint64, error) {
return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
}
// DeviceFromPath takes the path to a device and its cgroup_permissions (which
// cannot be easily queried) to look up the information about a linux device
// and returns that information as a Device struct.
// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
// information about a linux device and return that information as a Device struct.
func DeviceFromPath(path, permissions string) (*Device, error) {
var stat unix.Stat_t
err := unixLstat(path, &stat)

View File

@@ -196,11 +196,7 @@ func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
Validator: validate.New(),
CriuPath: "criu",
}
if err := Cgroupfs(l); err != nil {
return nil, err
}
Cgroupfs(l)
for _, opt := range options {
if opt == nil {
continue
@@ -291,7 +287,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
// when load, we need to check id is valid or not.
//when load, we need to check id is valid or not.
if err := l.validateID(id); err != nil {
return nil, err
}

View File

@@ -112,19 +112,9 @@ func populateProcessEnvironment(env []string) error {
for _, pair := range env {
p := strings.SplitN(pair, "=", 2)
if len(p) < 2 {
return fmt.Errorf("invalid environment variable: %q", pair)
return fmt.Errorf("invalid environment '%v'", pair)
}
name, val := p[0], p[1]
if name == "" {
return fmt.Errorf("environment variable name can't be empty: %q", pair)
}
if strings.IndexByte(name, 0) >= 0 {
return fmt.Errorf("environment variable name can't contain null(\\x00): %q", pair)
}
if strings.IndexByte(val, 0) >= 0 {
return fmt.Errorf("environment variable value can't contain null(\\x00): %q", pair)
}
if err := os.Setenv(name, val); err != nil {
if err := os.Setenv(p[0], p[1]); err != nil {
return err
}
}

View File

@@ -1,6 +1,8 @@
package intelrdt
var cmtEnabled bool
var (
cmtEnabled bool
)
// Check if Intel RDT/CMT is enabled.
func IsCMTEnabled() bool {

View File

@@ -2,8 +2,10 @@
package intelrdt
// The flag to indicate if Intel RDT/MBM is enabled
var mbmEnabled bool
var (
// The flag to indicate if Intel RDT/MBM is enabled
mbmEnabled bool
)
// Check if Intel RDT/MBM is enabled.
func IsMBMEnabled() bool {

View File

@@ -10,7 +10,9 @@ import (
"github.com/sirupsen/logrus"
)
var enabledMonFeatures monFeatures
var (
enabledMonFeatures monFeatures
)
type monFeatures struct {
mbmTotalBytes bool

View File

@@ -14,18 +14,18 @@ import (
type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
sessKeyID, err := unix.KeyctlJoinSessionKeyring(name)
sessKeyId, err := unix.KeyctlJoinSessionKeyring(name)
if err != nil {
return 0, errors.Wrap(err, "create session key")
}
return KeySerial(sessKeyID), nil
return KeySerial(sessKeyId), nil
}
// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting
// additional permission bits
func ModKeyringPerm(ringID KeySerial, mask, setbits uint32) error {
dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringID))
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringId))
if err != nil {
return err
}
@@ -43,5 +43,5 @@ func ModKeyringPerm(ringID KeySerial, mask, setbits uint32) error {
perm := (uint32(perm64) & mask) | setbits
return unix.KeyctlSetperm(int(ringID), perm)
return unix.KeyctlSetperm(int(ringId), perm)
}

View File

@@ -85,7 +85,7 @@ func ConfigureLogging(config Config) error {
if config.LogPipeFd > 0 {
logrus.SetOutput(os.NewFile(uintptr(config.LogPipeFd), "logpipe"))
} else if config.LogFilePath != "" {
f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0o644)
f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0644)
if err != nil {
return err
}

View File

@@ -83,7 +83,8 @@ func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
}
// loopback is a network strategy that provides a basic loopback device
type loopback struct{}
type loopback struct {
}
func (l *loopback) create(n *network, nspid int) error {
return nil

View File

@@ -35,7 +35,7 @@ func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct
eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0o700); err != nil {
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
eventfd.Close()
evFile.Close()
return nil, err

View File

@@ -188,7 +188,7 @@ func (p *setnsProcess) start() (retErr error) {
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
_, _ = p.wait()
p.wait()
return ierr
}
return nil
@@ -201,16 +201,16 @@ func (p *setnsProcess) start() (retErr error) {
func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
_ = p.cmd.Wait()
p.cmd.Wait()
return newSystemErrorWithCause(err, "waiting on setns process to finish")
}
if !status.Success() {
_ = p.cmd.Wait()
p.cmd.Wait()
return newSystemError(&exec.ExitError{ProcessState: status})
}
var pid *pid
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
_ = p.cmd.Wait()
p.cmd.Wait()
return newSystemErrorWithCause(err, "reading pid from init pipe")
}
@@ -292,7 +292,7 @@ func (p *initProcess) externalDescriptors() []string {
func (p *initProcess) getChildPid() (int, error) {
var pid pid
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
_ = p.cmd.Wait()
p.cmd.Wait()
return -1, err
}
@@ -309,11 +309,11 @@ func (p *initProcess) getChildPid() (int, error) {
func (p *initProcess) waitForChildExit(childPid int) error {
status, err := p.cmd.Process.Wait()
if err != nil {
_ = p.cmd.Wait()
p.cmd.Wait()
return err
}
if !status.Success() {
_ = p.cmd.Wait()
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
@@ -327,12 +327,12 @@ func (p *initProcess) waitForChildExit(childPid int) error {
}
func (p *initProcess) start() (retErr error) {
defer p.messageSockPair.parent.Close() //nolint: errcheck
defer p.messageSockPair.parent.Close()
err := p.cmd.Start()
p.process.ops = p
// close the write-side of the pipes (controlled by child)
_ = p.messageSockPair.child.Close()
_ = p.logFilePair.child.Close()
p.messageSockPair.child.Close()
p.logFilePair.child.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
@@ -371,9 +371,9 @@ func (p *initProcess) start() (retErr error) {
logrus.WithError(err).Warn("unable to terminate initProcess")
}
_ = p.manager.Destroy()
p.manager.Destroy()
if p.intelRdtManager != nil {
_ = p.intelRdtManager.Destroy()
p.intelRdtManager.Destroy()
}
}
}()
@@ -553,7 +553,7 @@ func (p *initProcess) start() (retErr error) {
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
_, _ = p.wait()
p.wait()
return ierr
}
return nil
@@ -563,7 +563,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
_ = signalAllProcesses(p.manager, unix.SIGKILL)
signalAllProcesses(p.manager, unix.SIGKILL)
}
return p.cmd.ProcessState, err
}
@@ -668,7 +668,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
defer func() {
if err != nil {
for _, fd := range fds {
_ = unix.Close(int(fd))
unix.Close(int(fd))
}
}
}()

View File

@@ -11,7 +11,9 @@ import (
)
func newRestoredProcess(cmd *exec.Cmd, fds []string) (*restoredProcess, error) {
var err error
var (
err error
)
pid := cmd.Process.Pid
stat, err := system.Stat(pid)
if err != nil {

View File

@@ -143,7 +143,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
if cwd := iConfig.Cwd; cwd != "" {
// Note that spec.Process.Cwd can contain unclean value like "../../../../foo/bar...".
// However, we are safe to call MkDirAll directly because we are in the jail here.
if err := os.MkdirAll(cwd, 0o755); err != nil {
if err := os.MkdirAll(cwd, 0755); err != nil {
return err
}
}
@@ -176,7 +176,7 @@ func finalizeRootfs(config *configs.Config) (err error) {
if config.Umask != nil {
unix.Umask(int(*config.Umask))
} else {
unix.Umask(0o022)
unix.Umask(0022)
}
return nil
}
@@ -196,9 +196,9 @@ func prepareTmp(topTmpDir string) (string, error) {
return tmpdir, nil
}
func cleanupTmp(tmpdir string) {
_ = unix.Unmount(tmpdir, 0)
_ = os.RemoveAll(tmpdir)
func cleanupTmp(tmpdir string) error {
unix.Unmount(tmpdir, 0)
return os.RemoveAll(tmpdir)
}
func mountCmd(cmd configs.Command) error {
@@ -262,7 +262,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
for _, b := range binds {
if c.cgroupns {
subsystemPath := filepath.Join(c.root, b.Destination)
if err := os.MkdirAll(subsystemPath, 0o755); err != nil {
if err := os.MkdirAll(subsystemPath, 0755); err != nil {
return err
}
if err := utils.WithProcfd(c.root, b.Destination, func(procfd string) error {
@@ -306,7 +306,7 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
if err != nil {
return err
}
if err := os.MkdirAll(dest, 0o755); err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
@@ -398,13 +398,13 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
} else if fi.Mode()&os.ModeDir == 0 {
return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
}
if err := os.MkdirAll(dest, 0o755); err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
// Selinux kernels do not support labeling of /proc or /sys
return mountPropagate(m, rootfs, "")
case "mqueue":
if err := os.MkdirAll(dest, 0o755); err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
if err := mountPropagate(m, rootfs, ""); err != nil {
@@ -414,7 +414,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
case "tmpfs":
stat, err := os.Stat(dest)
if err != nil {
if err := os.MkdirAll(dest, 0o755); err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
}
@@ -473,7 +473,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
if err := checkProcMount(rootfs, dest, m.Source); err != nil {
return err
}
if err := os.MkdirAll(dest, 0o755); err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
@@ -582,7 +582,7 @@ func isProc(path string) (bool, error) {
}
func setupDevSymlinks(rootfs string) error {
links := [][2]string{
var links = [][2]string{
{"/proc/self/fd", "/dev/fd"},
{"/proc/self/fd/0", "/dev/stdin"},
{"/proc/self/fd/1", "/dev/stdout"},
@@ -615,7 +615,7 @@ func reOpenDevNull() error {
if err != nil {
return fmt.Errorf("Failed to open /dev/null - %s", err)
}
defer file.Close() //nolint: errcheck
defer file.Close()
if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil {
return err
}
@@ -636,7 +636,7 @@ func reOpenDevNull() error {
// Create the device nodes in the container.
func createDevices(config *configs.Config) error {
useBindMount := userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
oldMask := unix.Umask(0o000)
oldMask := unix.Umask(0000)
for _, node := range config.Devices {
// The /dev/ptmx device is setup by setupPtmx()
@@ -661,7 +661,7 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
return err
}
if f != nil {
_ = f.Close()
f.Close()
}
return utils.WithProcfd(rootfs, dest, func(procfd string) error {
return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
@@ -678,7 +678,7 @@ func createDeviceNode(rootfs string, node *devices.Device, bind bool) error {
if err != nil {
return err
}
if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
return err
}
if bind {
@@ -799,6 +799,7 @@ func setReadonly() error {
}
flags |= uintptr(s.Flags)
return unix.Mount("", "/", "", flags, "")
}
func setupPtmx(config *configs.Config) error {
@@ -825,13 +826,13 @@ func pivotRoot(rootfs string) error {
if err != nil {
return err
}
defer unix.Close(oldroot) //nolint: errcheck
defer unix.Close(oldroot)
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return err
}
defer unix.Close(newroot) //nolint: errcheck
defer unix.Close(newroot)
// Change to the new root so that the pivot_root actually acts on it.
if err := unix.Fchdir(newroot); err != nil {
@@ -946,16 +947,16 @@ func createIfNotExists(path string, isDir bool) error {
if _, err := os.Stat(path); err != nil {
if os.IsNotExist(err) {
if isDir {
return os.MkdirAll(path, 0o755)
return os.MkdirAll(path, 0755)
}
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
f, err := os.OpenFile(path, os.O_CREATE, 0o755)
f, err := os.OpenFile(path, os.O_CREATE, 0755)
if err != nil {
return err
}
_ = f.Close()
f.Close()
}
}
return nil
@@ -1030,7 +1031,7 @@ func maskPath(path string, mountLabel string) error {
// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
keyPath := strings.Replace(key, ".", "/", -1)
return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644)
return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644)
}
func remount(m *configs.Mount, rootfs string) error {

View File

@@ -324,8 +324,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(sysno),
SkipTrue: uint8(baseJumpEnosys + 1),
},
SkipTrue: uint8(baseJumpEnosys + 1)},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}
@@ -354,20 +353,16 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
case libseccomp.ArchAMD64:
sectionTail = append([]bpf.Instruction{
// jset (1<<30),[len(tail)-1]
bpf.JumpIf{
Cond: bpf.JumpBitsSet,
bpf.JumpIf{Cond: bpf.JumpBitsSet,
Val: 1 << 30,
SkipTrue: uint8(len(sectionTail) - 1),
},
SkipTrue: uint8(len(sectionTail) - 1)},
}, sectionTail...)
case libseccomp.ArchX32:
sectionTail = append([]bpf.Instruction{
// jset (1<<30),0,[len(tail)-1]
bpf.JumpIf{
Cond: bpf.JumpBitsNotSet,
bpf.JumpIf{Cond: bpf.JumpBitsNotSet,
Val: 1 << 30,
SkipTrue: uint8(len(sectionTail) - 1),
},
SkipTrue: uint8(len(sectionTail) - 1)},
}, sectionTail...)
default:
return nil, errors.Errorf("unknown amd64 native architecture %#x", scmpArch)
@@ -407,14 +402,12 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x86sysno),
SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
},
SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1},
// jgt [x32 syscall],[baseJumpEnosys]
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x32sysno),
SkipTrue: uint8(baseJumpEnosys + 1),
},
SkipTrue: uint8(baseJumpEnosys + 1)},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}...)
@@ -433,14 +426,12 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x86sysno),
SkipTrue: 1, SkipFalse: 2,
},
SkipTrue: 1, SkipFalse: 2},
// jle [x32 syscall],[baseJumpEnosys]
bpf.JumpIf{
Cond: bpf.JumpLessOrEqual,
Val: uint32(x32sysno),
SkipTrue: 1,
},
SkipTrue: 1},
// ja [baseJumpEnosys+1]
bpf.Jump{Skip: baseJumpEnosys + 1},
// ja [baseJumpFilter]
@@ -487,8 +478,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
bpf.JumpIf{
Cond: bpf.JumpEqual,
Val: uint32(nativeArch),
SkipTrue: uint8(jump),
},
SkipTrue: uint8(jump)},
}, programTail...)
} else {
programTail = append([]bpf.Instruction{
@@ -496,8 +486,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(nativeArch),
SkipTrue: 1,
},
SkipTrue: 1},
// ja [jump]
bpf.Jump{Skip: jump},
}, programTail...)

View File

@@ -37,7 +37,7 @@ func (l *linuxSetnsInit) Init() error {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("") //nolint: errcheck
defer selinux.SetKeyLabel("")
// Do not inherit the parent's session keyring.
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
// Same justification as in standart_init_linux.go as to why we
@@ -65,7 +65,7 @@ func (l *linuxSetnsInit) Init() error {
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetExecLabel("") //nolint: errcheck
defer selinux.SetExecLabel("")
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.

View File

@@ -13,8 +13,8 @@ func Capture(userSkip int) Stacktrace {
)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)
// detect if caller is repeated to avoid loop, gccgo
// currently runs into a loop without this check
//detect if caller is repeated to avoid loop, gccgo
//currently runs into a loop without this check
if !ok || pc == prevPc {
break
}

View File

@@ -52,7 +52,7 @@ func (l *linuxStandardInit) Init() error {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("") //nolint: errcheck
defer selinux.SetKeyLabel("")
ringname, keepperms, newperms := l.getSessionRingParams()
// Do not inherit the parent's session keyring.
@@ -151,7 +151,7 @@ func (l *linuxStandardInit) Init() error {
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return errors.Wrap(err, "set process label")
}
defer selinux.SetExecLabel("") //nolint: errcheck
defer selinux.SetExecLabel("")
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
@@ -183,7 +183,7 @@ func (l *linuxStandardInit) Init() error {
}
// Close the pipe to signal that we have completed our init.
logrus.Debugf("init: closing the pipe to signal completion")
_ = l.pipe.Close()
l.pipe.Close()
// Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil {
@@ -207,7 +207,7 @@ func (l *linuxStandardInit) Init() error {
// N.B. the core issue itself (passing dirfds to the host filesystem) has
// since been resolved.
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
_ = unix.Close(l.fifoFd)
unix.Close(l.fifoFd)
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).

View File

@@ -157,7 +157,7 @@ func (i *createdState) transition(s containerState) error {
}
func (i *createdState) destroy() error {
_ = i.c.initProcess.signal(unix.SIGKILL)
i.c.initProcess.signal(unix.SIGKILL)
return destroy(i.c)
}

View File

@@ -96,7 +96,7 @@ func parseStat(data string) (stat Stat_t, err error) {
// one (PID) and two (Name) in the paren-split.
parts = strings.Split(data[i+2:], " ")
var state int
fmt.Sscanf(parts[3-3], "%c", &state) //nolint:staticcheck // "3-3" is more readable in this context.
fmt.Sscanf(parts[3-3], "%c", &state)
stat.State = State(state)
fmt.Sscanf(parts[22-3], "%d", &stat.StartTime)
return stat, nil

View File

@@ -0,0 +1,5 @@
package system
import "github.com/opencontainers/runc/libcontainer/userns"
var RunningInUserNS = userns.RunningInUserNS

View File

@@ -11,17 +11,19 @@ import (
)
const (
minID = 0
maxID = 1<<31 - 1 // for 32-bit systems compatibility
minId = 0
maxId = 1<<31 - 1 //for 32-bit systems compatibility
)
var (
// ErrNoPasswdEntries is returned if no matching entries were found in /etc/group.
// The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
// ErrNoGroupEntries is returned if no matching entries were found in /etc/passwd.
ErrNoGroupEntries = errors.New("no matching entries in group file")
// ErrRange is returned if a UID or GID is outside of the valid range.
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minID, maxID)
ErrNoGroupEntries = errors.New("no matching entries in group file")
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
)
type User struct {
@@ -326,7 +328,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
user.Uid = uidArg
// Must be inside valid uid range.
if user.Uid < minID || user.Uid > maxID {
if user.Uid < minId || user.Uid > maxId {
return nil, ErrRange
}
@@ -375,7 +377,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
user.Gid = gidArg
// Must be inside valid gid range.
if user.Gid < minID || user.Gid > maxID {
if user.Gid < minId || user.Gid > maxId {
return nil, ErrRange
}
@@ -399,7 +401,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
// or the given group data is nil, the id will be returned as-is
// provided it is in the legal range.
func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) {
groups := []Group{}
var groups = []Group{}
if group != nil {
var err error
groups, err = ParseGroupFilter(group, func(g Group) bool {
@@ -437,7 +439,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
return nil, fmt.Errorf("Unable to find group %s", ag)
}
// Ensure gid is inside gid range.
if gid < minID || gid > maxID {
if gid < minId || gid > maxId {
return nil, ErrRange
}
gidMap[int(gid)] = struct{}{}

Some files were not shown because too many files have changed in this diff Show More