vendor: cadvisor v0.39.0
Main upgrades:
- github.com/opencontainers/runc v1.0.0-rc93
- github.com/containerd/containerd v1.4.4
- github.com/docker/docker v20.10.2
- github.com/mrunalp/fileutils v0.5.0
- github.com/opencontainers/selinux v1.8.0
- github.com/cilium/ebpf v0.2.0
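For reference, these bumps surface in the consuming module's go.mod roughly as the following require entries (a sketch based only on the versions listed above; exact suffixes such as +incompatible and the surrounding entries depend on the repository):

require (
	github.com/cilium/ebpf v0.2.0
	github.com/containerd/containerd v1.4.4
	github.com/docker/docker v20.10.2+incompatible
	github.com/google/cadvisor v0.39.0
	github.com/mrunalp/fileutils v0.5.0
	github.com/opencontainers/runc v1.0.0-rc93
	github.com/opencontainers/selinux v1.8.0
)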
vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go (generated, vendored, new file, 628 lines)
@@ -0,0 +1,628 @@
// +build linux,cgo,seccomp

package patchbpf

import (
	"encoding/binary"
	"io"
	"os"
	"runtime"
	"unsafe"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/utils"

	"github.com/pkg/errors"
	libseccomp "github.com/seccomp/libseccomp-golang"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/bpf"
	"golang.org/x/sys/unix"
)

// #cgo pkg-config: libseccomp
/*
#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/seccomp.h>

const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);

// Copied from <linux/seccomp.h>.

#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;

#ifndef SECCOMP_FILTER_FLAG_LOG
# define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;

// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.

const uint32_t C_AUDIT_ARCH_I386 = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64 = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64 = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64 = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32 = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64 = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X;
*/
import "C"

var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)

func isAllowAction(action configs.Action) bool {
	switch action {
	// Trace is considered an "allow" action because a good tracer should
	// support future syscalls (by handling -ENOSYS on its own), and giving
	// -ENOSYS will be disruptive for emulation.
	case configs.Allow, configs.Log, configs.Trace:
		return true
	default:
		return false
	}
}

func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
	var program []bpf.RawInstruction
loop:
	for {
		// Read the next instruction. We have to use NativeEndian because
		// seccomp_export_bpf outputs the program in *host* endian-ness.
		var insn unix.SockFilter
		if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
			switch err {
			case io.EOF:
				// Parsing complete.
				break loop
			case io.ErrUnexpectedEOF:
				// Parsing stopped mid-instruction.
				return nil, errors.Wrap(err, "program parsing halted mid-instruction")
			default:
				// All other errors.
				return nil, errors.Wrap(err, "parsing instructions")
			}
		}
		program = append(program, bpf.RawInstruction{
			Op: insn.Code,
			Jt: insn.Jt,
			Jf: insn.Jf,
			K:  insn.K,
		})
	}
	return program, nil
}

func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
	rdr, wtr, err := os.Pipe()
	if err != nil {
		return nil, errors.Wrap(err, "creating scratch pipe")
	}
	defer wtr.Close()
	defer rdr.Close()

	if err := filter.ExportBPF(wtr); err != nil {
		return nil, errors.Wrap(err, "exporting BPF")
	}
	// Close so that the reader actually gets EOF.
	_ = wtr.Close()

	// Parse the instructions.
	rawProgram, err := parseProgram(rdr)
	if err != nil {
		return nil, errors.Wrap(err, "parsing generated BPF filter")
	}
	program, ok := bpf.Disassemble(rawProgram)
	if !ok {
		return nil, errors.Errorf("could not disassemble entire BPF filter")
	}
	return program, nil
}

type nativeArch uint32

const invalidArch nativeArch = 0

func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
	switch arch {
	case libseccomp.ArchNative:
		// Convert to actual native architecture.
		arch, err := libseccomp.GetNativeArch()
		if err != nil {
			return invalidArch, errors.Wrap(err, "get native arch")
		}
		return archToNative(arch)
	case libseccomp.ArchX86:
		return nativeArch(C.C_AUDIT_ARCH_I386), nil
	case libseccomp.ArchAMD64, libseccomp.ArchX32:
		// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
		// 30th bit of the syscall number set to indicate that it's not a
		// normal x86_64 syscall.
		return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
	case libseccomp.ArchARM:
		return nativeArch(C.C_AUDIT_ARCH_ARM), nil
	case libseccomp.ArchARM64:
		return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
	case libseccomp.ArchMIPS:
		return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
	case libseccomp.ArchMIPS64:
		return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
	case libseccomp.ArchMIPS64N32:
		return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
	case libseccomp.ArchMIPSEL:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
	case libseccomp.ArchMIPSEL64:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
	case libseccomp.ArchMIPSEL64N32:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
	case libseccomp.ArchPPC:
		return nativeArch(C.C_AUDIT_ARCH_PPC), nil
	case libseccomp.ArchPPC64:
		return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
	case libseccomp.ArchPPC64LE:
		return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
	case libseccomp.ArchS390:
		return nativeArch(C.C_AUDIT_ARCH_S390), nil
	case libseccomp.ArchS390X:
		return nativeArch(C.C_AUDIT_ARCH_S390X), nil
	default:
		return invalidArch, errors.Errorf("unknown architecture: %v", arch)
	}
}

type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
	lastSyscalls := make(lastSyscallMap)
	// Only loop over architectures which are present in the filter. Any other
	// architectures will get the libseccomp bad architecture action anyway.
	for _, ociArch := range config.Architectures {
		arch, err := libseccomp.GetArchFromString(ociArch)
		if err != nil {
			return nil, errors.Wrap(err, "validating seccomp architecture")
		}

		// Map native architecture to a real architecture value to avoid
		// doubling-up the lastSyscall mapping.
		if arch == libseccomp.ArchNative {
			nativeArch, err := libseccomp.GetNativeArch()
			if err != nil {
				return nil, errors.Wrap(err, "get native arch")
			}
			arch = nativeArch
		}

		// Figure out native architecture representation of the architecture.
		nativeArch, err := archToNative(arch)
		if err != nil {
			return nil, errors.Wrapf(err, "cannot map architecture %v to AUDIT_ARCH_ constant", arch)
		}

		if _, ok := lastSyscalls[nativeArch]; !ok {
			lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
		}
		if _, ok := lastSyscalls[nativeArch][arch]; ok {
			// Because of ArchNative we may hit the same entry multiple times.
			// Just skip it if we've seen this (nativeArch, ScmpArch)
			// combination before.
			continue
		}

		// Find the largest syscall in the filter for this architecture.
		var largestSyscall libseccomp.ScmpSyscall
		for _, rule := range config.Syscalls {
			sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
			if err != nil {
				// Ignore unknown syscalls.
				continue
			}
			if sysno > largestSyscall {
				largestSyscall = sysno
			}
		}
		if largestSyscall != 0 {
			lastSyscalls[nativeArch][arch] = largestSyscall
		} else {
			logrus.Warnf("could not find any syscalls for arch %s", ociArch)
			delete(lastSyscalls[nativeArch], arch)
		}
	}
	return lastSyscalls, nil
}

// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on kernel
// minimum version alone.
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
	// A jump-table for each nativeArch used to generate the initial
	// conditional jumps -- measured from the *END* of the program so they
	// remain valid after prepending to the tail.
	archJumpTable := map[nativeArch]uint32{}

	// Generate our own -ENOSYS rules for each architecture. They have to be
	// generated in reverse (prepended to the tail of the program) because the
	// JumpIf jumps need to be computed from the end of the program.
	programTail := []bpf.Instruction{
		// Fall-through rules jump into the filter.
		bpf.Jump{Skip: 1},
		// Rules which jump to here get -ENOSYS.
		bpf.RetConstant{Val: retErrnoEnosys},
	}

	// Generate the syscall -ENOSYS rules.
	for nativeArch, maxSyscalls := range lastSyscalls {
		// The number of instructions from the tail of this section which need
		// to be jumped in order to reach the -ENOSYS return. If the section
		// does not jump, it will fall through to the actual filter.
		baseJumpEnosys := uint32(len(programTail) - 1)
		baseJumpFilter := baseJumpEnosys + 1

		// Add the load instruction for the syscall number -- we jump here
		// directly from the arch code so we need to do it here. Sadly we can't
		// share this code between architecture branches.
		section := []bpf.Instruction{
			// load [0]
			bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
		}

		switch len(maxSyscalls) {
		case 0:
			// No syscalls found for this arch -- skip it and move on.
			continue
		case 1:
			// Get the only syscall in the map.
			var sysno libseccomp.ScmpSyscall
			for _, no := range maxSyscalls {
				sysno = no
			}

			// The simplest case just boils down to a single jgt instruction,
			// with special handling if baseJumpEnosys is larger than 255 (and
			// thus a long jump is required).
			var sectionTail []bpf.Instruction
			if baseJumpEnosys+1 <= 255 {
				sectionTail = []bpf.Instruction{
					// jgt [syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(sysno),
						SkipTrue: uint8(baseJumpEnosys + 1)},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			} else {
				sectionTail = []bpf.Instruction{
					// jle [syscall],1
					bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
					// ja [baseJumpEnosys+1]
					bpf.Jump{Skip: baseJumpEnosys + 1},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			}

			// If we're on x86 we need to add a check for x32 and if we're in
			// the wrong mode we jump over the section.
			if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
				// Grab the only architecture in the map.
				var scmpArch libseccomp.ScmpArch
				for arch := range maxSyscalls {
					scmpArch = arch
				}

				// Generate a prefix to check the mode.
				switch scmpArch {
				case libseccomp.ArchAMD64:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),[len(tail)-1]
						bpf.JumpIf{Cond: bpf.JumpBitsSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1)},
					}, sectionTail...)
				case libseccomp.ArchX32:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),0,[len(tail)-1]
						bpf.JumpIf{Cond: bpf.JumpBitsNotSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1)},
					}, sectionTail...)
				default:
					return nil, errors.Errorf("unknown amd64 native architecture %#x", scmpArch)
				}
			}

			section = append(section, sectionTail...)
		case 2:
			// x32 and x86_64 are a unique case, we can't handle any others.
			if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
				return nil, errors.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
			}

			x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
			if !ok {
				return nil, errors.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
			}
			x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
			if !ok {
				return nil, errors.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
			}

			// The x32 ABI indicates that a syscall is being made by an x32
			// process by setting the 30th bit of the syscall number, but we
			// need to do some special-casing depending on whether we need to
			// do long jumps.
			if baseJumpEnosys+2 <= 255 {
				// For the simple case we want to have something like:
				//   jset (1<<30),1
				//   jgt [x86 syscall],[baseJumpEnosys+2],1
				//   jgt [x32 syscall],[baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],[baseJumpEnosys+1],1
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1},
					// jgt [x32 syscall],[baseJumpEnosys]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x32sysno),
						SkipTrue: uint8(baseJumpEnosys + 1)},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			} else {
				// But if the [baseJumpEnosys+2] jump is larger than 255 we
				// need to do a long jump like so:
				//   jset (1<<30),1
				//   jgt [x86 syscall],1,2
				//   jle [x32 syscall],1
				//   ja [baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],1,2
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: 1, SkipFalse: 2},
					// jle [x32 syscall],[baseJumpEnosys]
					bpf.JumpIf{
						Cond:     bpf.JumpLessOrEqual,
						Val:      uint32(x32sysno),
						SkipTrue: 1},
					// ja [baseJumpEnosys+1]
					bpf.Jump{Skip: baseJumpEnosys + 1},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			}
		default:
			return nil, errors.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
		}

		// Prepend this section to the tail.
		programTail = append(section, programTail...)

		// Update jump table.
		archJumpTable[nativeArch] = uint32(len(programTail))
	}

	// Add a dummy "jump to filter" for any architecture we might miss below.
	// Such architectures will probably get the BadArch action of the filter
	// regardless.
	programTail = append([]bpf.Instruction{
		// ja [end of stub and start of filter]
		bpf.Jump{Skip: uint32(len(programTail))},
	}, programTail...)

	// Generate the jump rules for each architecture. This has to be done in
	// reverse as well for the same reason as above. We add to programTail
	// directly because the jumps are impacted by each architecture rule we add
	// as well.
	//
	// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
	// architectures based on how large the jumps are going to be, or
	// re-sort the candidate architectures each time to make sure that we
	// pick the largest jump which is going to be smaller than 255.
	for nativeArch := range lastSyscalls {
		// We jump forwards but the jump table is calculated from the *END*.
		jump := uint32(len(programTail)) - archJumpTable[nativeArch]

		// Same routine as above -- this is a basic jeq check, complicated
		// slightly if it turns out that we need to do a long jump.
		if jump <= 255 {
			programTail = append([]bpf.Instruction{
				// jeq [arch],[jump]
				bpf.JumpIf{
					Cond:     bpf.JumpEqual,
					Val:      uint32(nativeArch),
					SkipTrue: uint8(jump)},
			}, programTail...)
		} else {
			programTail = append([]bpf.Instruction{
				// jne [arch],1
				bpf.JumpIf{
					Cond:     bpf.JumpNotEqual,
					Val:      uint32(nativeArch),
					SkipTrue: 1},
				// ja [jump]
				bpf.Jump{Skip: jump},
			}, programTail...)
		}
	}

	// Prepend the load instruction for the architecture.
	programTail = append([]bpf.Instruction{
		// load [4]
		bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
	}, programTail...)

	// And that's all folks!
	return programTail, nil
}

func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
	rawProgram, err := bpf.Assemble(program)
	if err != nil {
		return nil, errors.Wrap(err, "assembling program")
	}

	// Convert to []unix.SockFilter for unix.SockFilter.
	var filter []unix.SockFilter
	for _, insn := range rawProgram {
		filter = append(filter, unix.SockFilter{
			Code: insn.Op,
			Jt:   insn.Jt,
			Jf:   insn.Jf,
			K:    insn.K,
		})
	}
	return filter, nil
}

func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
	// We only add the stub if the default action is not permissive.
	if isAllowAction(config.DefaultAction) {
		logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
		return nil, nil
	}

	lastSyscalls, err := findLastSyscalls(config)
	if err != nil {
		return nil, errors.Wrap(err, "finding last syscalls for -ENOSYS stub")
	}
	stubProgram, err := generateEnosysStub(lastSyscalls)
	if err != nil {
		return nil, errors.Wrap(err, "generating -ENOSYS stub")
	}
	return stubProgram, nil
}

func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
	program, err := disassembleFilter(filter)
	if err != nil {
		return nil, errors.Wrap(err, "disassembling original filter")
	}

	patch, err := generatePatch(config)
	if err != nil {
		return nil, errors.Wrap(err, "generating patch for filter")
	}
	fullProgram := append(patch, program...)

	logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
	for idx, insn := range patch {
		logrus.Debugf(" [%4.1d] %s", idx, insn)
	}
	logrus.Debugf(" [....] --- original filter ---")

	fprog, err := assemble(fullProgram)
	if err != nil {
		return nil, errors.Wrap(err, "assembling modified filter")
	}
	return fprog, nil
}

func filterFlags(filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
	apiLevel, _ := libseccomp.GetApi()

	noNewPrivs, err = filter.GetNoNewPrivsBit()
	if err != nil {
		return 0, false, errors.Wrap(err, "fetch no_new_privs filter bit")
	}

	if apiLevel >= 3 {
		if logBit, err := filter.GetLogBit(); err != nil {
			return 0, false, errors.Wrap(err, "fetch SECCOMP_FILTER_FLAG_LOG bit")
		} else if logBit {
			flags |= uint(C.C_FILTER_FLAG_LOG)
		}
	}

	// TODO: Support seccomp flags not yet added to libseccomp-golang...
	return
}

func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (err error) {
	fprog := unix.SockFprog{
		Len:    uint16(len(filter)),
		Filter: &filter[0],
	}
	// If no seccomp flags were requested we can use the old-school prctl(2).
	if flags == 0 {
		err = unix.Prctl(unix.PR_SET_SECCOMP,
			unix.SECCOMP_MODE_FILTER,
			uintptr(unsafe.Pointer(&fprog)), 0, 0)
	} else {
		_, _, err = unix.RawSyscall(unix.SYS_SECCOMP,
			uintptr(C.C_SET_MODE_FILTER),
			uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
	}
	runtime.KeepAlive(filter)
	runtime.KeepAlive(fprog)
	return
}

// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than the
// default libseccomp default action behaviour, and loads the patched filter
// into the kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) error {
	// Generate a patched filter.
	fprog, err := enosysPatchFilter(config, filter)
	if err != nil {
		return errors.Wrap(err, "patching filter")
	}

	// Get the set of libseccomp flags set.
	seccompFlags, noNewPrivs, err := filterFlags(filter)
	if err != nil {
		return errors.Wrap(err, "fetch seccomp filter flags")
	}

	// Set no_new_privs if it was requested, though in runc we handle
	// no_new_privs separately so warn if we hit this path.
	if noNewPrivs {
		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return errors.Wrap(err, "enable no_new_privs bit")
		}
	}

	// Finally, load the filter.
	if err := sysSeccompSetFilter(seccompFlags, fprog); err != nil {
		return errors.Wrap(err, "loading seccomp filter")
	}
	return nil
}
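The only exported entry point in this new file is PatchAndLoad. As a rough illustration of how a caller might drive it, here is a hypothetical sketch (not runc's actual call site in libcontainer/seccomp; the toy allow-list policy, the package name, and loadPolicy are made up for the example):

// Hypothetical sketch of a PatchAndLoad caller; not part of this diff.
package seccompexample

import (
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/seccomp/patchbpf"
	libseccomp "github.com/seccomp/libseccomp-golang"
	"golang.org/x/sys/unix"
)

// loadPolicy builds a libseccomp filter that mirrors config (deny-by-default
// with EPERM, an explicit allow rule for each listed syscall) and then lets
// patchbpf prepend the -ENOSYS stub and load the result into *this* process.
// config.DefaultAction must be a non-allow action for the stub to be added.
func loadPolicy(config *configs.Seccomp) error {
	filter, err := libseccomp.NewFilter(libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)))
	if err != nil {
		return err
	}
	for _, rule := range config.Syscalls {
		sysno, err := libseccomp.GetSyscallFromName(rule.Name)
		if err != nil {
			continue // syscall unknown to this libseccomp build; skip it
		}
		if err := filter.AddRule(sysno, libseccomp.ActAllow); err != nil {
			return err
		}
	}
	// PatchAndLoad disassembles the filter, prepends the -ENOSYS stub for any
	// syscall numbered above the largest one referenced in config, and loads
	// the result via seccomp(2) (or prctl(2) when no flags are needed).
	return patchbpf.PatchAndLoad(config, filter)
}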
vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_unsupported.go (generated, vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
// +build !linux !cgo !seccomp

package patchbpf