From 7e7545e55654b9bf096101dab5f3a33caa9420a4 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 24 Aug 2020 12:01:12 +0200 Subject: [PATCH 1/7] seccomp: allow preadv2 and pwritev2 syscalls Signed-off-by: Sebastiaan van Stijn --- contrib/seccomp/seccomp_default.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/seccomp/seccomp_default.go b/contrib/seccomp/seccomp_default.go index 2adb1c937..a20b3670c 100644 --- a/contrib/seccomp/seccomp_default.go +++ b/contrib/seccomp/seccomp_default.go @@ -235,11 +235,13 @@ func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp { "prctl", "pread64", "preadv", + "preadv2", "prlimit64", "pselect6", "pselect6_time64", "pwrite64", "pwritev", + "pwritev2", "read", "readahead", "readlink", From 1746a195e9dc8b974733cd295e82aedc56525079 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 24 Aug 2020 12:09:27 +0200 Subject: [PATCH 2/7] seccomp: allow adjtimex get time operation Enabled adjtimex in the default profile without requiring CAP_SYS_TIME privilege. The kernel will check CAP_SYS_TIME and won't allow setting the time. 
Fixes: Getting the system time with ntptime returns an error in an unprivileged container To verify, inside a CentOS 7 container: yum install -y ntp ntptime # ntp_gettime() returns code 0 (OK) ntpdate -v time.nist.gov # ntpdate[84]: Can't adjust the time of day: Operation not permitted Signed-off-by: Sebastiaan van Stijn --- contrib/seccomp/seccomp_default.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/seccomp/seccomp_default.go b/contrib/seccomp/seccomp_default.go index a20b3670c..3305b6be1 100644 --- a/contrib/seccomp/seccomp_default.go +++ b/contrib/seccomp/seccomp_default.go @@ -55,6 +55,7 @@ func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp { "accept", "accept4", "access", + "adjtimex", "alarm", "bind", "brk", @@ -555,7 +556,6 @@ func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp { Names: []string{ "settimeofday", "stime", - "adjtimex", }, Action: specs.ActAllow, Args: []specs.LinuxSeccompArg{}, From fc9e5d161a2c3dfdd313ec997eb5750e0a9a7663 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 24 Aug 2020 12:15:10 +0200 Subject: [PATCH 3/7] seccomp: allow syscall membarrier Add the membarrier syscall to the default seccomp profile. It is for example used in the implementation of dlopen() in the musl libc of Alpine images. 
Signed-off-by: Sebastiaan van Stijn --- contrib/seccomp/seccomp_default.go | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/seccomp/seccomp_default.go b/contrib/seccomp/seccomp_default.go index 3305b6be1..753b21973 100644 --- a/contrib/seccomp/seccomp_default.go +++ b/contrib/seccomp/seccomp_default.go @@ -192,6 +192,7 @@ func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp { "lstat", "lstat64", "madvise", + "membarrier", "memfd_create", "mincore", "mkdir", From 117d678749d740becd1d896bfdb244b3846625e0 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 24 Aug 2020 12:27:14 +0200 Subject: [PATCH 4/7] seccomp: allow personality with UNAME26 bit set From personality(2): Have uname(2) report a 2.6.40+ version number rather than a 3.x version number. Added as a stopgap measure to support broken applications that could not handle the kernel version-numbering switch from 2.6.x to 3.x. This allows both "UNAME26|PER_LINUX" and "UNAME26|PER_LINUX32". Fixes: "setarch broken in docker packages from Debian stretch" Signed-off-by: Sebastiaan van Stijn --- contrib/seccomp/seccomp_default.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/contrib/seccomp/seccomp_default.go b/contrib/seccomp/seccomp_default.go index 753b21973..f675833ae 100644 --- a/contrib/seccomp/seccomp_default.go +++ b/contrib/seccomp/seccomp_default.go @@ -412,6 +412,28 @@ func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp { }, }, }, + { + Names: []string{"personality"}, + Action: specs.ActAllow, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: 0x20000, + Op: specs.OpEqualTo, + }, + }, + }, + { + Names: []string{"personality"}, + Action: specs.ActAllow, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: 0x20008, + Op: specs.OpEqualTo, + }, + }, + }, { Names: []string{"personality"}, Action: specs.ActAllow, From 5862285facc350859d875a5a37d94dc39d214187 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 24 Aug 2020 12:36:53 +0200 
Subject: [PATCH 5/7] seccomp: allow sync_file_range2 on supported architectures. On a ppc64le host, running postgres (tried with 9.4 to 9.6) gives the following warning when trying to flush data to disks (which happens very frequently): WARNING: could not flush dirty data: Operation not permitted. A quick dig in postgres source code indicates it uses sync_file_range(2) to flush data; which on ppc64le and arm64 is translated to sync_file_range2(2) for alignment reasons. The profile did not allow sync_file_range2(2), making postgres sad because it cannot flush its buffers. arm_sync_file_range(2) is an ancient alias to sync_file_range2(2); the syscall was renamed in Linux 2.6.22 when the same syscall was added for PowerPC. Signed-off-by: Sebastiaan van Stijn --- contrib/seccomp/seccomp_default.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/contrib/seccomp/seccomp_default.go b/contrib/seccomp/seccomp_default.go index f675833ae..0dfdcd4c9 100644 --- a/contrib/seccomp/seccomp_default.go +++ b/contrib/seccomp/seccomp_default.go @@ -455,11 +455,20 @@ func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp { // include by arch switch runtime.GOARCH { + case "ppc64le": + s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{ + Names: []string{ + "sync_file_range2", + }, + Action: specs.ActAllow, + Args: []specs.LinuxSeccompArg{}, + }) case "arm", "arm64": s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{ Names: []string{ "arm_fadvise64_64", "arm_sync_file_range", + "sync_file_range2", "breakpoint", "cacheflush", "set_tls", From 5cdb6e81d2c6889355a917890b3c441de15a762d Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 24 Aug 2020 12:40:43 +0200 Subject: [PATCH 6/7] seccomp: allow quotactl with CAP_SYS_ADMIN This allows the quotactl syscall in the default seccomp profile, gated by CAP_SYS_ADMIN. 
Signed-off-by: Sebastiaan van Stijn --- contrib/seccomp/seccomp_default.go | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/seccomp/seccomp_default.go b/contrib/seccomp/seccomp_default.go index 0dfdcd4c9..95a3a3118 100644 --- a/contrib/seccomp/seccomp_default.go +++ b/contrib/seccomp/seccomp_default.go @@ -525,6 +525,7 @@ func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp { "mount", "name_to_handle_at", "perf_event_open", + "quotactl", "setdomainname", "sethostname", "setns", From 0a5ee7e6f3dd95ccb09bd93527ccc66c29b0ee5e Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Mon, 24 Aug 2020 12:43:21 +0200 Subject: [PATCH 7/7] seccomp: allow clock_settime when CAP_SYS_TIME is added Signed-off-by: Sebastiaan van Stijn --- contrib/seccomp/seccomp_default.go | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/seccomp/seccomp_default.go b/contrib/seccomp/seccomp_default.go index 95a3a3118..ab159bf82 100644 --- a/contrib/seccomp/seccomp_default.go +++ b/contrib/seccomp/seccomp_default.go @@ -589,6 +589,7 @@ func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp { Names: []string{ "settimeofday", "stime", + "clock_settime", }, Action: specs.ActAllow, Args: []specs.LinuxSeccompArg{},