diff --git a/RELEASES.md b/RELEASES.md index 32c52e936..6b9061b2d 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -461,6 +461,25 @@ version = 2

+## Other breaking changes +### containerd v2.0 +#### CRI plugin treats read-only mounts recursively read-only +Starting with containerd v2.0, the CRI plugin treats read-only mounts +as recursively read-only mounts when running on Linux kernel v5.12 or later. + +To rollback to the legacy behavior that corresponds to containerd v1.x, +set the following config: +```toml +version = 2 +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + # treat_ro_mounts_as_rro ("Enabled"|"IfPossible"|"Disabled") + # treats read-only mounts as recursive read-only mounts. + # An empty string means "IfPossible". + # "Enabled" requires Linux kernel v5.12 or later. + # This configuration does not apply to non-volume mounts such as "/sys/fs/cgroup". + treat_ro_mounts_as_rro = "Disabled" +``` + ## Experimental features Experimental features are new features added to containerd which do not have the diff --git a/docs/cri/config.md b/docs/cri/config.md index 63041db75..09008e260 100644 --- a/docs/cri/config.md +++ b/docs/cri/config.md @@ -369,6 +369,14 @@ version = 2 # See https://github.com/containerd/containerd/issues/6657 for context. snapshotter = "" + # treat_ro_mounts_as_rro ("Enabled"|"IfPossible"|"Disabled") + # treats read-only mounts as recursive read-only mounts. + # An empty string means "IfPossible". + # "Enabled" requires Linux kernel v5.12 or later. + # Introduced in containerd v2.0. + # This configuration does not apply to non-volume mounts such as "/sys/fs/cgroup". + treat_ro_mounts_as_rro = "" + # 'plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options' is options specific to # "io.containerd.runc.v1" and "io.containerd.runc.v2". Its corresponding options type is: # https://github.com/containerd/containerd/blob/v1.3.2/runtime/v2/runc/options/oci.pb.go#L26 . diff --git a/integration/container_volume_linux_test.go b/integration/container_volume_linux_test.go new file mode 100644 index 000000000..9fd62e362 --- /dev/null +++ b/integration/container_volume_linux_test.go @@ -0,0 +1,149 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package integration + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/containerd/containerd/v2/core/mount" + "github.com/containerd/containerd/v2/integration/images" + "github.com/containerd/containerd/v2/pkg/kernelversion" + "github.com/opencontainers/selinux/go-selinux" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func testReadonlyMounts(t *testing.T, mode string, expectRRO bool) { + workDir := t.TempDir() + mntSrcDir := filepath.Join(workDir, "mnt") // "/mnt" in the container + require.NoError(t, os.MkdirAll(mntSrcDir, 0755)) + tmpfsDir := filepath.Join(mntSrcDir, "tmpfs") // "/mnt/tmpfs" in the container + require.NoError(t, os.MkdirAll(tmpfsDir, 0755)) + tmpfsMount := mount.Mount{ + Type: "tmpfs", + Source: "none", + } + require.NoError(t, tmpfsMount.Mount(tmpfsDir)) + t.Cleanup(func() { + require.NoError(t, mount.UnmountAll(tmpfsDir, 0)) + }) + + podLogDir := filepath.Join(workDir, "podLogDir") + require.NoError(t, os.MkdirAll(podLogDir, 0755)) + + config := `version = 2 +` + if mode != "" { + config += fmt.Sprintf(` +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + treat_ro_mount_as_rro = %q +`, mode) + } + require.NoError(t, os.WriteFile(filepath.Join(workDir, "config.toml"), + []byte(config), 0644)) + ctrdProc := newCtrdProc(t, "containerd", workDir) + t.Cleanup(func() { + cleanupPods(t, ctrdProc.criRuntimeService(t)) + require.NoError(t, ctrdProc.kill(syscall.SIGTERM)) + require.NoError(t, ctrdProc.wait(5*time.Minute)) + if t.Failed() { + dumpFileContent(t, ctrdProc.logPath()) + } + }) + runtimeServiceOrig, imageServiceOrig := runtimeService, imageService + runtimeService, imageService = ctrdProc.criRuntimeService(t), ctrdProc.criImageService(t) + t.Cleanup(func() { + runtimeService, imageService = runtimeServiceOrig, imageServiceOrig + }) + require.NoError(t, ctrdProc.isReady()) + + sb, sbConfig := PodSandboxConfigWithCleanup(t, "sandbox", "test-ro-mounts", + WithPodLogDirectory(podLogDir), + ) + + testImage := images.Get(images.BusyBox) + EnsureImageExists(t, testImage) + + containerName := "test-container" + cnConfig := ContainerConfig( + containerName, + testImage, + WithCommand("/bin/touch", "/mnt/tmpfs/file"), + WithLogPath(containerName), + func(c *runtime.ContainerConfig) { + c.Mounts = append(c.Mounts, &runtime.Mount{ + HostPath: mntSrcDir, + ContainerPath: "/mnt", + SelinuxRelabel: selinux.GetEnabled(), + Readonly: true, + }) + }, + ) + + cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig) + require.NoError(t, err) + + t.Log("Start the container") + require.NoError(t, runtimeService.StartContainer(cn)) + + t.Log("Wait for container to finish running") + exitCode := -1 + require.NoError(t, Eventually(func() (bool, error) { + s, err := runtimeService.ContainerStatus(cn) + if err != nil { + return false, err + } + if s.GetState() == runtime.ContainerState_CONTAINER_EXITED { + exitCode = int(s.ExitCode) + return true, nil + } + return false, nil + }, time.Second, 30*time.Second)) + + output, err := os.ReadFile(filepath.Join(podLogDir, containerName)) + assert.NoError(t, err) + t.Logf("exitCode=%d, output=%q", exitCode, output) + + if expectRRO { + require.NotEqual(t, 0, exitCode) + require.Contains(t, string(output), "stderr F touch: /mnt/tmpfs/file: Read-only file system\n") + } else { + require.Equal(t, 0, exitCode) + } +} + +func TestReadonlyMounts(t *testing.T) { + kernelSupportsRRO, err := kernelversion.GreaterEqualThan(kernelversion.KernelVersion{Kernel: 5, Major: 12}) + require.NoError(t, err) + t.Run("Default", func(t *testing.T) { + testReadonlyMounts(t, "", kernelSupportsRRO) + }) + t.Run("Disabled", func(t *testing.T) { + testReadonlyMounts(t, "Disabled", false) + }) + if kernelSupportsRRO { + t.Run("Enabled", func(t *testing.T) { + testReadonlyMounts(t, "Enabled", true) + }) + } +} diff --git a/pkg/cri/config/config.go b/pkg/cri/config/config.go index 6b0a0be93..a8bde65cf 100644 --- a/pkg/cri/config/config.go +++ b/pkg/cri/config/config.go @@ -21,9 +21,15 @@ import ( "errors" "fmt" "net/url" + goruntime "runtime" + "strconv" "time" + introspectionapi "github.com/containerd/containerd/v2/api/services/introspection/v1" + apitypes "github.com/containerd/containerd/v2/api/types" + "github.com/containerd/containerd/v2/protobuf" "github.com/containerd/log" + "github.com/containerd/typeurl/v2" "github.com/pelletier/go-toml/v2" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/kubelet/pkg/cri/streaming" @@ -34,8 +40,16 @@ import ( "github.com/containerd/containerd/v2/pkg/deprecation" runtimeoptions "github.com/containerd/containerd/v2/pkg/runtimeoptions/v1" "github.com/containerd/containerd/v2/plugins" + "github.com/opencontainers/image-spec/specs-go" + "github.com/opencontainers/runtime-spec/specs-go/features" ) +func init() { + const prefix = "types.containerd.io" + major := strconv.Itoa(specs.VersionMajor) + typeurl.Register(&features.Features{}, prefix, "opencontainers/runtime-spec", major, "features", "Features") +} + const ( // defaultImagePullProgressTimeoutDuration is the default value of imagePullProgressTimeout. // @@ -73,6 +87,17 @@ const ( DefaultSandboxImage = "registry.k8s.io/pause:3.9" ) +// Ternary represents a ternary value. +// Ternary is needed because TOML does not accept "null" for boolean values. +type Ternary = string + +const ( + TernaryEmpty Ternary = "" // alias for IfPossible + TernaryEnabled Ternary = "Enabled" + TernaryIfPossible Ternary = "IfPossible" + TernaryDisabled Ternary = "Disabled" +) + // Runtime struct to contain the type(ID), engine, and root variables for a default runtime // and a runtime for untrusted workload. type Runtime struct { @@ -116,6 +141,15 @@ type Runtime struct { // shim - means use whatever Controller implementation provided by shim (e.g. use RemoteController). // podsandbox - means use Controller implementation from sbserver podsandbox package. Sandboxer string `toml:"sandboxer" json:"sandboxer"` + + // TreatRoMountsAsRro ("Enabled"|"IfPossible"|"Disabled") + // treats read-only mounts as recursive read-only mounts. + // An empty string means "IfPossible". + // "Enabled" requires Linux kernel v5.12 or later. + // Introduced in containerd v2.0. + // This configuration does not apply to non-volume mounts such as "/sys/fs/cgroup". + TreatRoMountsAsRro Ternary `toml:"treat_ro_mount_as_rro" json:"treatRoMountsAsRro"` + TreatRoMountsAsRroResolved bool `toml:"-" json:"-"` // Do not set manually } // ContainerdConfig contains toml config related to containerd @@ -499,8 +533,120 @@ func ValidateImageConfig(ctx context.Context, c *ImageConfig) ([]deprecation.War return warnings, nil } +func introspectRuntimeFeatures(ctx context.Context, introspectionClient introspectionapi.IntrospectionClient, r Runtime) (*features.Features, error) { + if introspectionClient == nil { // happens for unit tests + return nil, errors.New("introspectionClient is nil") + } + infoReq := &introspectionapi.PluginInfoRequest{ + Type: string(plugins.RuntimePluginV2), + ID: "task", + } + rr := &apitypes.RuntimeRequest{ + RuntimePath: r.Type, + } + if r.Path != "" { + rr.RuntimePath = r.Path + } + options, err := GenerateRuntimeOptions(r) + if err != nil { + return nil, err + } + rr.Options, err = protobuf.MarshalAnyToProto(options) + if err != nil { + return nil, fmt.Errorf("failed to marshal %T: %w", options, err) + } + infoReq.Options, err = protobuf.MarshalAnyToProto(rr) + if err != nil { + return nil, fmt.Errorf("failed to marshal %T: %w", rr, err) + } + infoResp, err := introspectionClient.PluginInfo(ctx, infoReq) + if err != nil { + return nil, fmt.Errorf("failed to call PluginInfo: %w", err) + } + var info apitypes.RuntimeInfo + if err := typeurl.UnmarshalTo(infoResp.Extra, &info); err != nil { + return nil, fmt.Errorf("failed to get runtime info from plugin info: %w", err) + } + featuresX, err := typeurl.UnmarshalAny(info.Features) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal Features (%T): %w", info.Features, err) + } + features, ok := featuresX.(*features.Features) + if !ok { + return nil, fmt.Errorf("unknown features type %T", featuresX) + } + return features, nil +} + +// resolveTreatRoMountsAsRro resolves r.TreatRoMountsAsRro string into a boolean. +func resolveTreatRoMountsAsRro(ctx context.Context, introspectionClient introspectionapi.IntrospectionClient, r Runtime) (bool, error) { + debugPrefix := "treat_ro_mounts_as_rro" + if r.Type != "" { + debugPrefix += fmt.Sprintf("[%s]", r.Type) + } + if binaryName := r.Options["BinaryName"]; binaryName != "" { + debugPrefix += fmt.Sprintf("[%v]", binaryName) + } + debugPrefix += ": " + + var runtimeSupportsRro bool + if r.Type == plugins.RuntimeRuncV2 { + features, err := introspectRuntimeFeatures(ctx, introspectionClient, r) + if err != nil { + log.G(ctx).WithError(err).Warnf(debugPrefix + "failed to introspect runtime features (binary is not compatible with runc v1.1?)") + } else { + log.G(ctx).Debugf(debugPrefix+"Features: %+v", features) + for _, s := range features.MountOptions { + if s == "rro" { + runtimeSupportsRro = true + break + } + } + } + } + + switch r.TreatRoMountsAsRro { + case TernaryDisabled: + log.G(ctx).Debug(debugPrefix + "rro mounts are explicitly disabled") + return false, nil + case TernaryEnabled: + log.G(ctx).Debug(debugPrefix + "rro mounts are explicitly enabled") + if !kernelSupportsRro { + return true, fmt.Errorf("invalid `treat_ro_mounts_as_rro`: %q: needs Linux kernel v5.12 or later", TernaryEnabled) + } + if !runtimeSupportsRro { + return true, fmt.Errorf("invalid `treat_ro_mounts_as_rro`: %q: needs a runtime that is compatible with runc v1.1", TernaryEnabled) + } + return true, nil + case TernaryEmpty, TernaryIfPossible: + if r.Type != plugins.RuntimeRuncV2 { + log.G(ctx).Debugf(debugPrefix+"rro mounts are not supported by runtime %q, disabling rro mounts", r.Type) + return false, nil + } + if !kernelSupportsRro { + msg := debugPrefix + "rro mounts are not supported by kernel, disabling rro mounts" + if goruntime.GOOS == "linux" { + msg += " (Hint: upgrade the kernel to v5.12 or later)" + log.G(ctx).Warn(msg) + } else { + log.G(ctx).Debug(msg) + } + return false, nil + } + if !runtimeSupportsRro { + log.G(ctx).Warn(debugPrefix + "rro mounts are not supported by runtime, disabling rro mounts (Hint: use a runtime that is compatible with runc v1.1)") + return false, nil + } + log.G(ctx).Debug(debugPrefix + "rro mounts are implicitly enabled") + return true, nil + default: + return false, fmt.Errorf("invalid `treat_ro_mounts_as_rro`: %q (must be %q, %q, or %q)", + r.TreatRoMountsAsRro, TernaryDisabled, TernaryEnabled, TernaryIfPossible) + } +} + // ValidateRuntimeConfig validates the given runtime configuration. -func ValidateRuntimeConfig(ctx context.Context, c *RuntimeConfig) ([]deprecation.Warning, error) { +func ValidateRuntimeConfig(ctx context.Context, c *RuntimeConfig, introspectionClient introspectionapi.IntrospectionClient) ([]deprecation.Warning, error) { var warnings []deprecation.Warning if c.ContainerdConfig.Runtimes == nil { c.ContainerdConfig.Runtimes = make(map[string]Runtime) @@ -521,8 +667,15 @@ func ValidateRuntimeConfig(ctx context.Context, c *RuntimeConfig) ([]deprecation // If empty, use default podSandbox mode if len(r.Sandboxer) == 0 { r.Sandboxer = string(ModePodSandbox) - c.ContainerdConfig.Runtimes[k] = r } + + // Resolve r.TreatRoMountsAsRro (string; empty value must not be ignored) into r.TreatRoMountsAsRroResolved (bool) + var err error + r.TreatRoMountsAsRroResolved, err = resolveTreatRoMountsAsRro(ctx, introspectionClient, r) + if err != nil { + return warnings, err + } + c.ContainerdConfig.Runtimes[k] = r } // Validation for drain_exec_sync_io_timeout diff --git a/pkg/cri/config/config_kernel_linux.go b/pkg/cri/config/config_kernel_linux.go index 85c564ae3..296e104bc 100644 --- a/pkg/cri/config/config_kernel_linux.go +++ b/pkg/cri/config/config_kernel_linux.go @@ -41,3 +41,13 @@ func ValidateEnableUnprivileged(ctx context.Context, c *RuntimeConfig) error { } return nil } + +var kernelSupportsRro bool + +func init() { + var err error + kernelSupportsRro, err = kernelGreaterEqualThan(kernel.KernelVersion{Kernel: 5, Major: 12}) + if err != nil { + panic(fmt.Errorf("check current system kernel version error: %w", err)) + } +} diff --git a/pkg/cri/config/config_kernel_other.go b/pkg/cri/config/config_kernel_other.go index 433c1682f..bc675414f 100644 --- a/pkg/cri/config/config_kernel_other.go +++ b/pkg/cri/config/config_kernel_other.go @@ -25,3 +25,5 @@ import ( func ValidateEnableUnprivileged(ctx context.Context, c *RuntimeConfig) error { return nil } + +var kernelSupportsRro bool diff --git a/pkg/cri/config/config_test.go b/pkg/cri/config/config_test.go index f7a5f30df..8a982d95d 100644 --- a/pkg/cri/config/config_test.go +++ b/pkg/cri/config/config_test.go @@ -222,7 +222,7 @@ func TestValidateConfig(t *testing.T) { t.Run(desc, func(t *testing.T) { var warnings []deprecation.Warning if test.runtimeConfig != nil { - w, err := ValidateRuntimeConfig(context.Background(), test.runtimeConfig) + w, err := ValidateRuntimeConfig(context.Background(), test.runtimeConfig, nil) if test.runtimeExpectedErr != "" { assert.Contains(t, err.Error(), test.runtimeExpectedErr) } else { diff --git a/pkg/cri/opts/spec_linux_opts.go b/pkg/cri/opts/spec_linux_opts.go index 806a35d1b..6deac3628 100644 --- a/pkg/cri/opts/spec_linux_opts.go +++ b/pkg/cri/opts/spec_linux_opts.go @@ -38,8 +38,14 @@ import ( "github.com/containerd/log" ) +// RuntimeConfig is a subset of [github.com/containerd/containerd/v2/pkg/cri/config]. +// Needed for avoiding circular imports. +type RuntimeConfig struct { + TreatRoMountsAsRro bool // only applies to volumes +} + // WithMounts sorts and adds runtime and CRI mounts to the spec -func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts { +func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string, rtConfig *RuntimeConfig) oci.SpecOpts { return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) { // mergeMounts merge CRI mounts with extra mounts. If a mount destination // is mounted by both a CRI mount and an extra mount, the CRI mount will @@ -67,6 +73,7 @@ func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*ru sort.Sort(orderedMounts(mounts)) // Mount cgroup into the container as readonly, which inherits docker's behavior. + // TreatRoMountsAsRro does not apply here, as /sys/fs/cgroup is not a volume. s.Mounts = append(s.Mounts, runtimespec.Mount{ Source: "cgroup", Destination: "/sys/fs/cgroup", @@ -148,10 +155,25 @@ func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*ru options = append(options, "rprivate") } + var srcIsDir bool + if srcSt, err := osi.Stat(src); err != nil { + if errors.Is(err, os.ErrNotExist) { // happens when osi is FakeOS + srcIsDir = true // assume src to be dir + } else { + return fmt.Errorf("failed to stat mount source %q: %w", src, err) + } + } else if srcSt != nil { // srcSt can be nil when osi is FakeOS + srcIsDir = srcSt.IsDir() + } + // NOTE(random-liu): we don't change all mounts to `ro` when root filesystem // is readonly. This is different from docker's behavior, but make more sense. if mount.GetReadonly() { - options = append(options, "ro") + if rtConfig != nil && rtConfig.TreatRoMountsAsRro && srcIsDir { + options = append(options, "rro") + } else { + options = append(options, "ro") + } } else { options = append(options, "rw") } diff --git a/pkg/cri/server/container_create.go b/pkg/cri/server/container_create.go index b922bc976..b579200e9 100644 --- a/pkg/cri/server/container_create.go +++ b/pkg/cri/server/container_create.go @@ -683,7 +683,9 @@ func (c *criService) buildLinuxSpec( } }() - specOpts = append(specOpts, customopts.WithMounts(c.os, config, extraMounts, mountLabel)) + specOpts = append(specOpts, customopts.WithMounts(c.os, config, extraMounts, mountLabel, &customopts.RuntimeConfig{ + TreatRoMountsAsRro: ociRuntime.TreatRoMountsAsRroResolved, + })) if !c.config.DisableProcMount { // Change the default masked/readonly paths to empty slices diff --git a/pkg/cri/server/container_create_linux_test.go b/pkg/cri/server/container_create_linux_test.go index e46b00df1..6b7992cc1 100644 --- a/pkg/cri/server/container_create_linux_test.go +++ b/pkg/cri/server/container_create_linux_test.go @@ -597,7 +597,7 @@ func TestMountPropagation(t *testing.T) { var spec runtimespec.Spec spec.Linux = &runtimespec.Linux{} - err := opts.WithMounts(c.os, config, []*runtime.Mount{test.criMount}, "")(context.Background(), nil, nil, &spec) + err := opts.WithMounts(c.os, config, []*runtime.Mount{test.criMount}, "", nil)(context.Background(), nil, nil, &spec) if test.expectErr { require.Error(t, err) } else { diff --git a/plugins/cri/runtime/plugin.go b/plugins/cri/runtime/plugin.go index eb82ef2ce..521f6ed7b 100644 --- a/plugins/cri/runtime/plugin.go +++ b/plugins/cri/runtime/plugin.go @@ -24,6 +24,7 @@ import ( "os" "path/filepath" + introspectionapi "github.com/containerd/containerd/v2/api/services/introspection/v1" "github.com/containerd/log" "github.com/containerd/plugin" "github.com/containerd/plugin/registry" @@ -35,6 +36,7 @@ import ( "github.com/containerd/containerd/v2/pkg/cri/constants" "github.com/containerd/containerd/v2/pkg/oci" "github.com/containerd/containerd/v2/plugins" + "github.com/containerd/containerd/v2/plugins/services" "github.com/containerd/containerd/v2/plugins/services/warning" "github.com/containerd/errdefs" "github.com/containerd/platforms" @@ -50,6 +52,7 @@ func init() { Config: &config, Requires: []plugin.Type{ plugins.WarningPlugin, + plugins.ServicePlugin, }, ConfigMigration: func(ctx context.Context, version int, pluginConfigs map[string]interface{}) error { if version >= srvconfig.CurrentConfigVersion { @@ -73,7 +76,18 @@ func initCRIRuntime(ic *plugin.InitContext) (interface{}, error) { ic.Meta.Exports = map[string]string{"CRIVersion": constants.CRIVersion} ctx := ic.Context pluginConfig := ic.Config.(*criconfig.RuntimeConfig) - if warnings, err := criconfig.ValidateRuntimeConfig(ctx, pluginConfig); err != nil { + + introspectionService, err := ic.GetByID(plugins.ServicePlugin, services.IntrospectionService) + if err != nil { + return nil, fmt.Errorf("failed to get plugin (%q, %q): %w", + plugins.ServicePlugin, services.IntrospectionService, err) + } + introspectionClient, ok := introspectionService.(introspectionapi.IntrospectionClient) + if !ok { + return nil, fmt.Errorf("%+v does not implement IntrospectionClient interfae", introspectionService) + } + + if warnings, err := criconfig.ValidateRuntimeConfig(ctx, pluginConfig, introspectionClient); err != nil { return nil, fmt.Errorf("invalid plugin config: %w", err) } else if len(warnings) > 0 { ws, err := ic.GetSingle(plugins.WarningPlugin)