cri: make read-only mounts recursively read-only
Prior to this commit, `readOnly` volumes were not recursively read-only, which
could result in data being compromised;
e.g., even if `/mnt` was mounted as read-only, its submounts such as
`/mnt/usbstorage` were not read-only.
This commit utilizes runc's "rro" bind mount option to make read-only bind
mounts literally read-only. The "rro" bind mount option is implemented by
calling `mount_setattr(2)` with `MOUNT_ATTR_RDONLY` and `AT_RECURSIVE`.
The "rro" bind mount option requires kernel >= 5.12, with runc >= 1.1 or
a compatible runtime such as crun >= 1.4.
When the "rro" bind mount option is not available, containerd falls back
to the legacy non-recursive read-only mounts by default.
The behavior is configurable via `/etc/containerd/config.toml`:
```toml
version = 2
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
# treat_ro_mounts_as_rro ("Enabled"|"IfPossible"|"Disabled")
# treats read-only mounts as recursive read-only mounts.
# An empty string means "IfPossible".
# "Enabled" requires Linux kernel v5.12 or later.
# This configuration does not apply to non-volume mounts such as "/sys/fs/cgroup".
treat_ro_mounts_as_rro = ""
```
Replaces:
- kubernetes/enhancements issue 3857
- kubernetes/enhancements PR 3858
Note: this change does not affect non-CRI clients such as ctr, nerdctl, and Docker/Moby.
RRO mounts have been supported since nerdctl v0.14 (containerd/nerdctl PR 511)
and Docker v25 (moby/moby PR 45278).
Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
This commit is contained in:
@@ -38,8 +38,14 @@ import (
|
||||
"github.com/containerd/log"
|
||||
)
|
||||
|
||||
// RuntimeConfig is a subset of [github.com/containerd/containerd/v2/pkg/cri/config].
|
||||
// Needed for avoiding circular imports.
|
||||
type RuntimeConfig struct {
|
||||
TreatRoMountsAsRro bool // only applies to volumes
|
||||
}
|
||||
|
||||
// WithMounts sorts and adds runtime and CRI mounts to the spec
|
||||
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts {
|
||||
func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string, rtConfig *RuntimeConfig) oci.SpecOpts {
|
||||
return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
|
||||
// mergeMounts merge CRI mounts with extra mounts. If a mount destination
|
||||
// is mounted by both a CRI mount and an extra mount, the CRI mount will
|
||||
@@ -67,6 +73,7 @@ func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*ru
|
||||
sort.Sort(orderedMounts(mounts))
|
||||
|
||||
// Mount cgroup into the container as readonly, which inherits docker's behavior.
|
||||
// TreatRoMountsAsRro does not apply here, as /sys/fs/cgroup is not a volume.
|
||||
s.Mounts = append(s.Mounts, runtimespec.Mount{
|
||||
Source: "cgroup",
|
||||
Destination: "/sys/fs/cgroup",
|
||||
@@ -148,10 +155,25 @@ func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*ru
|
||||
options = append(options, "rprivate")
|
||||
}
|
||||
|
||||
var srcIsDir bool
|
||||
if srcSt, err := osi.Stat(src); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) { // happens when osi is FakeOS
|
||||
srcIsDir = true // assume src to be dir
|
||||
} else {
|
||||
return fmt.Errorf("failed to stat mount source %q: %w", src, err)
|
||||
}
|
||||
} else if srcSt != nil { // srcSt can be nil when osi is FakeOS
|
||||
srcIsDir = srcSt.IsDir()
|
||||
}
|
||||
|
||||
// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
|
||||
// is readonly. This is different from docker's behavior, but make more sense.
|
||||
if mount.GetReadonly() {
|
||||
options = append(options, "ro")
|
||||
if rtConfig != nil && rtConfig.TreatRoMountsAsRro && srcIsDir {
|
||||
options = append(options, "rro")
|
||||
} else {
|
||||
options = append(options, "ro")
|
||||
}
|
||||
} else {
|
||||
options = append(options, "rw")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user