containerd/contrib/seccomp/seccomp_default.go
Wei Fu 6b7e237fc7 chore: use go fix to cleanup old +build buildtag
Signed-off-by: Wei Fu <fuweid89@gmail.com>
2022-12-29 14:25:14 +08:00

748 lines
15 KiB
Go

//go:build linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package seccomp
import (
"runtime"
"golang.org/x/sys/unix"
"github.com/containerd/containerd/contrib/seccomp/kernelversion"
"github.com/opencontainers/runtime-spec/specs-go"
)
func arches() []specs.Arch {
switch runtime.GOARCH {
case "amd64":
return []specs.Arch{specs.ArchX86_64, specs.ArchX86, specs.ArchX32}
case "arm64":
return []specs.Arch{specs.ArchARM, specs.ArchAARCH64}
case "mips64":
return []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64, specs.ArchMIPS64N32}
case "mips64n32":
return []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64, specs.ArchMIPS64N32}
case "mipsel64":
return []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64, specs.ArchMIPSEL64N32}
case "mipsel64n32":
return []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64, specs.ArchMIPSEL64N32}
case "s390x":
return []specs.Arch{specs.ArchS390, specs.ArchS390X}
case "riscv64":
// ArchRISCV32 (SCMP_ARCH_RISCV32) does not exist
return []specs.Arch{specs.ArchRISCV64}
default:
return []specs.Arch{}
}
}
// DefaultProfile defines the allowed syscalls for the default seccomp profile.
func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp {
nosys := uint(unix.ENOSYS)
syscalls := []specs.LinuxSyscall{
{
Names: []string{
"accept",
"accept4",
"access",
"adjtimex",
"alarm",
"bind",
"brk",
"capget",
"capset",
"chdir",
"chmod",
"chown",
"chown32",
"clock_adjtime",
"clock_adjtime64",
"clock_getres",
"clock_getres_time64",
"clock_gettime",
"clock_gettime64",
"clock_nanosleep",
"clock_nanosleep_time64",
"close",
"close_range",
"connect",
"copy_file_range",
"creat",
"dup",
"dup2",
"dup3",
"epoll_create",
"epoll_create1",
"epoll_ctl",
"epoll_ctl_old",
"epoll_pwait",
"epoll_pwait2",
"epoll_wait",
"epoll_wait_old",
"eventfd",
"eventfd2",
"execve",
"execveat",
"exit",
"exit_group",
"faccessat",
"faccessat2",
"fadvise64",
"fadvise64_64",
"fallocate",
"fanotify_mark",
"fchdir",
"fchmod",
"fchmodat",
"fchown",
"fchown32",
"fchownat",
"fcntl",
"fcntl64",
"fdatasync",
"fgetxattr",
"flistxattr",
"flock",
"fork",
"fremovexattr",
"fsetxattr",
"fstat",
"fstat64",
"fstatat64",
"fstatfs",
"fstatfs64",
"fsync",
"ftruncate",
"ftruncate64",
"futex",
"futex_time64",
"futex_waitv",
"futimesat",
"getcpu",
"getcwd",
"getdents",
"getdents64",
"getegid",
"getegid32",
"geteuid",
"geteuid32",
"getgid",
"getgid32",
"getgroups",
"getgroups32",
"getitimer",
"getpeername",
"getpgid",
"getpgrp",
"getpid",
"getppid",
"getpriority",
"getrandom",
"getresgid",
"getresgid32",
"getresuid",
"getresuid32",
"getrlimit",
"get_robust_list",
"getrusage",
"getsid",
"getsockname",
"getsockopt",
"get_thread_area",
"gettid",
"gettimeofday",
"getuid",
"getuid32",
"getxattr",
"inotify_add_watch",
"inotify_init",
"inotify_init1",
"inotify_rm_watch",
"io_cancel",
"ioctl",
"io_destroy",
"io_getevents",
"io_pgetevents",
"io_pgetevents_time64",
"ioprio_get",
"ioprio_set",
"io_setup",
"io_submit",
"io_uring_enter",
"io_uring_register",
"io_uring_setup",
"ipc",
"kill",
"landlock_add_rule",
"landlock_create_ruleset",
"landlock_restrict_self",
"lchown",
"lchown32",
"lgetxattr",
"link",
"linkat",
"listen",
"listxattr",
"llistxattr",
"_llseek",
"lremovexattr",
"lseek",
"lsetxattr",
"lstat",
"lstat64",
"madvise",
"membarrier",
"memfd_create",
"memfd_secret",
"mincore",
"mkdir",
"mkdirat",
"mknod",
"mknodat",
"mlock",
"mlock2",
"mlockall",
"mmap",
"mmap2",
"mprotect",
"mq_getsetattr",
"mq_notify",
"mq_open",
"mq_timedreceive",
"mq_timedreceive_time64",
"mq_timedsend",
"mq_timedsend_time64",
"mq_unlink",
"mremap",
"msgctl",
"msgget",
"msgrcv",
"msgsnd",
"msync",
"munlock",
"munlockall",
"munmap",
"nanosleep",
"newfstatat",
"_newselect",
"open",
"openat",
"openat2",
"pause",
"pidfd_open",
"pidfd_send_signal",
"pipe",
"pipe2",
"pkey_alloc",
"pkey_free",
"pkey_mprotect",
"poll",
"ppoll",
"ppoll_time64",
"prctl",
"pread64",
"preadv",
"preadv2",
"prlimit64",
"process_mrelease",
"pselect6",
"pselect6_time64",
"pwrite64",
"pwritev",
"pwritev2",
"read",
"readahead",
"readlink",
"readlinkat",
"readv",
"recv",
"recvfrom",
"recvmmsg",
"recvmmsg_time64",
"recvmsg",
"remap_file_pages",
"removexattr",
"rename",
"renameat",
"renameat2",
"restart_syscall",
"rmdir",
"rseq",
"rt_sigaction",
"rt_sigpending",
"rt_sigprocmask",
"rt_sigqueueinfo",
"rt_sigreturn",
"rt_sigsuspend",
"rt_sigtimedwait",
"rt_sigtimedwait_time64",
"rt_tgsigqueueinfo",
"sched_getaffinity",
"sched_getattr",
"sched_getparam",
"sched_get_priority_max",
"sched_get_priority_min",
"sched_getscheduler",
"sched_rr_get_interval",
"sched_rr_get_interval_time64",
"sched_setaffinity",
"sched_setattr",
"sched_setparam",
"sched_setscheduler",
"sched_yield",
"seccomp",
"select",
"semctl",
"semget",
"semop",
"semtimedop",
"semtimedop_time64",
"send",
"sendfile",
"sendfile64",
"sendmmsg",
"sendmsg",
"sendto",
"setfsgid",
"setfsgid32",
"setfsuid",
"setfsuid32",
"setgid",
"setgid32",
"setgroups",
"setgroups32",
"setitimer",
"setpgid",
"setpriority",
"setregid",
"setregid32",
"setresgid",
"setresgid32",
"setresuid",
"setresuid32",
"setreuid",
"setreuid32",
"setrlimit",
"set_robust_list",
"setsid",
"setsockopt",
"set_thread_area",
"set_tid_address",
"setuid",
"setuid32",
"setxattr",
"shmat",
"shmctl",
"shmdt",
"shmget",
"shutdown",
"sigaltstack",
"signalfd",
"signalfd4",
"sigprocmask",
"sigreturn",
"socketcall",
"socketpair",
"splice",
"stat",
"stat64",
"statfs",
"statfs64",
"statx",
"symlink",
"symlinkat",
"sync",
"sync_file_range",
"syncfs",
"sysinfo",
"tee",
"tgkill",
"time",
"timer_create",
"timer_delete",
"timer_getoverrun",
"timer_gettime",
"timer_gettime64",
"timer_settime",
"timer_settime64",
"timerfd_create",
"timerfd_gettime",
"timerfd_gettime64",
"timerfd_settime",
"timerfd_settime64",
"times",
"tkill",
"truncate",
"truncate64",
"ugetrlimit",
"umask",
"uname",
"unlink",
"unlinkat",
"utime",
"utimensat",
"utimensat_time64",
"utimes",
"vfork",
"vmsplice",
"wait4",
"waitid",
"waitpid",
"write",
"writev",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
},
{
Names: []string{"socket"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: unix.AF_VSOCK,
Op: specs.OpNotEqual,
},
},
},
{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x0,
Op: specs.OpEqualTo,
},
},
},
{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x0008,
Op: specs.OpEqualTo,
},
},
},
{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x20000,
Op: specs.OpEqualTo,
},
},
},
{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x20008,
Op: specs.OpEqualTo,
},
},
},
{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0xffffffff,
Op: specs.OpEqualTo,
},
},
},
}
s := &specs.LinuxSeccomp{
DefaultAction: specs.ActErrno,
Architectures: arches(),
Syscalls: syscalls,
}
// include by kernel version
if ok, err := kernelversion.GreaterEqualThan(
kernelversion.KernelVersion{Kernel: 4, Major: 8}); err == nil {
if ok {
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"process_vm_readv",
"process_vm_writev",
"ptrace",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
}
}
// include by arch
switch runtime.GOARCH {
case "ppc64le":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"sync_file_range2",
"swapcontext",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "arm", "arm64":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"arm_fadvise64_64",
"arm_sync_file_range",
"sync_file_range2",
"breakpoint",
"cacheflush",
"set_tls",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "amd64":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"arch_prctl",
"modify_ldt",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "386":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"modify_ldt",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "s390", "s390x":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"s390_pci_mmio_read",
"s390_pci_mmio_write",
"s390_runtime_instr",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "riscv64":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"riscv_flush_icache",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
}
admin := false
for _, c := range sp.Process.Capabilities.Bounding {
switch c {
case "CAP_DAC_READ_SEARCH":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{"open_by_handle_at"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_ADMIN":
admin = true
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"bpf",
"clone",
"clone3",
"fanotify_init",
"fsconfig",
"fsmount",
"fsopen",
"fspick",
"lookup_dcookie",
"mount",
"mount_setattr",
"move_mount",
"name_to_handle_at",
"open_tree",
"perf_event_open",
"quotactl",
"quotactl_fd",
"setdomainname",
"sethostname",
"setns",
"syslog",
"umount",
"umount2",
"unshare",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_BOOT":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{"reboot"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_CHROOT":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{"chroot"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_MODULE":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"delete_module",
"init_module",
"finit_module",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_PACCT":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{"acct"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_PTRACE":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"kcmp",
"pidfd_getfd",
"process_madvise",
"process_vm_readv",
"process_vm_writev",
"ptrace",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_RAWIO":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"iopl",
"ioperm",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_TIME":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"settimeofday",
"stime",
"clock_settime",
"clock_settime64",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_TTY_CONFIG":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{"vhangup"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYS_NICE":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"get_mempolicy",
"mbind",
"set_mempolicy",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_SYSLOG":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{"syslog"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_BPF":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{"bpf"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
case "CAP_PERFMON":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{"perf_event_open"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{},
})
}
}
if !admin {
switch runtime.GOARCH {
case "s390", "s390x":
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"clone",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 1,
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
ValueTwo: 0,
Op: specs.OpMaskedEqual,
},
},
})
default:
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"clone",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
ValueTwo: 0,
Op: specs.OpMaskedEqual,
},
},
})
}
// clone3 is explicitly requested to give ENOSYS instead of the default EPERM, when CAP_SYS_ADMIN is unset
// https://github.com/moby/moby/pull/42681
s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
Names: []string{
"clone3",
},
Action: specs.ActErrno,
ErrnoRet: &nosys,
})
}
return s
}