*: introduce image_pull_with_sync_fs in CRI
It's to ensure the data integrity during unexpected power failure.
Background:
Since release 1.3, in Linux system, containerD unpacks and writes files into
overlayfs snapshot directly. It doesn’t involve any mount-umount operations
so that the performance of pulling image has been improved.
As we know, the umount syscall for overlayfs will force kernel to flush
all the dirty pages into disk. Without umount syscall, the files’ data relies
on kernel’s writeback threads or filesystem's commit setting (for
instance, ext4 filesystem).
The files in committed snapshot can be loss after unexpected power failure.
However, the snapshot has been committed and the metadata also has been
fsynced. There is data inconsistency between snapshot metadata and files
in that snapshot.
We, containerd, received several issues about data loss after unexpected
power failure.
* https://github.com/containerd/containerd/issues/5854
* https://github.com/containerd/containerd/issues/3369#issuecomment-1787334907
Solution:
* Option 1: SyncFs after unpack
Linux platform provides [syncfs][syncfs] syscall to synchronize just the
filesystem containing a given file.
* Option 2: Fsync directories recursively and fsync on regular file
The fsync doesn't support symlink/block device/char device files. We
need to use fsync the parent directory to ensure that entry is
persisted.
However, based on [xfstest-dev][xfstest-dev], there is no case to ensure
fsync-on-parent can persist the special file's metadata, for example,
uid/gid, access mode.
Checkout [generic/690][generic/690]: Syncing parent dir can persist
symlink. But for f2fs, it needs special mount option. And it doesn't say
that uid/gid can be persisted. All the details are behind the
implemetation.
> NOTE: All the related test cases has `_flakey_drop_and_remount` in
[xfstest-dev].
Based on discussion about [Documenting the crash-recovery guarantees of Linux file systems][kernel-crash-recovery-data-integrity],
we can't rely on Fsync-on-parent.
* Option 1 is winner
This patch is using option 1.
There is test result based on [test-tool][test-tool].
All the networking traffic created by pull is local.
* Image: docker.io/library/golang:1.19.4 (992 MiB)
* Current: 5.446738579s
* WIOS=21081, WBytes=1329741824, RIOS=79, RBytes=1197056
* Option 1: 6.239686088s
* WIOS=34804, WBytes=1454845952, RIOS=79, RBytes=1197056
* Option 2: 1m30.510934813s
* WIOS=42143, WBytes=1471397888, RIOS=82, RBytes=1209344
* Image: docker.io/tensorflow/tensorflow:latest (1.78 GiB, ~32590 Inodes)
* Current: 8.852718042s
* WIOS=39417, WBytes=2412818432, RIOS=2673, RBytes=335987712
* Option 1: 9.683387174s
* WIOS=42767, WBytes=2431750144, RIOS=89, RBytes=1238016
* Option 2: 1m54.302103719s
* WIOS=54403, WBytes=2460528640, RIOS=1709, RBytes=208237568
The Option 1 will increase `wios`. So, the `image_pull_with_sync_fs` is
option in CRI plugin.
[syncfs]: <https://man7.org/linux/man-pages/man2/syncfs.2.html>
[xfstest-dev]: <https://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git>
[generic/690]: <https://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git/tree/tests/generic/690?h=v2023.11.19>
[kernel-crash-recovery-data-integrity]: <https://lore.kernel.org/linux-fsdevel/1552418820-18102-1-git-send-email-jaya@cs.utexas.edu/>
[test-tool]: <a17fb2010d/contrib/syncfs/containerd/main_test.go (L51)
>
Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
parent
bd5c602c4d
commit
23278c81fb
@ -279,6 +279,14 @@ func WithUnpackDuplicationSuppressor(suppressor kmutex.KeyedLocker) UnpackOpt {
|
||||
}
|
||||
}
|
||||
|
||||
// WithUnpackApplyOpts appends new apply options on the UnpackConfig.
|
||||
func WithUnpackApplyOpts(opts ...diff.ApplyOpt) UnpackOpt {
|
||||
return func(ctx context.Context, uc *UnpackConfig) error {
|
||||
uc.ApplyOpts = append(uc.ApplyOpts, opts...)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func (i *image) Unpack(ctx context.Context, snapshotterName string, opts ...UnpackOpt) error {
|
||||
ctx, done, err := i.client.WithLease(ctx)
|
||||
if err != nil {
|
||||
|
@ -60,6 +60,7 @@ func (s *service) Apply(ctx context.Context, er *diffapi.ApplyRequest) (*diffapi
|
||||
}
|
||||
opts = append(opts, diff.WithPayloads(payloads))
|
||||
}
|
||||
opts = append(opts, diff.WithSyncFs(er.SyncFs))
|
||||
|
||||
ocidesc, err = s.applier.Apply(ctx, desc, mounts, opts...)
|
||||
if err != nil {
|
||||
|
@ -92,7 +92,7 @@ func (s *fsApplier) Apply(ctx context.Context, desc ocispec.Descriptor, mounts [
|
||||
r: io.TeeReader(processor, digester.Hash()),
|
||||
}
|
||||
|
||||
if err := apply(ctx, mounts, rc); err != nil {
|
||||
if err := apply(ctx, mounts, rc, config.SyncFs); err != nil {
|
||||
return emptyDesc, err
|
||||
}
|
||||
|
||||
|
@ -25,7 +25,7 @@ import (
|
||||
"github.com/containerd/containerd/v2/mount"
|
||||
)
|
||||
|
||||
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
|
||||
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader, _sync bool) error {
|
||||
// We currently do not support mounts nor bind mounts on MacOS in the containerd daemon.
|
||||
// Using this as an exception to enable native snapshotter and allow further research.
|
||||
if len(mounts) == 1 && mounts[0].Type == "bind" {
|
||||
@ -38,6 +38,8 @@ func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
|
||||
path := mounts[0].Source
|
||||
_, err := archive.Apply(ctx, path, r, opts...)
|
||||
return err
|
||||
|
||||
// TODO: Do we need to sync all the filesystems?
|
||||
}
|
||||
|
||||
return mount.WithTempMount(ctx, mounts, func(root string) error {
|
||||
|
@ -20,15 +20,18 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/containerd/containerd/v2/archive"
|
||||
"github.com/containerd/containerd/v2/errdefs"
|
||||
"github.com/containerd/containerd/v2/mount"
|
||||
"github.com/containerd/containerd/v2/pkg/userns"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
|
||||
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader, sync bool) (retErr error) {
|
||||
switch {
|
||||
case len(mounts) == 1 && mounts[0].Type == "overlay":
|
||||
// OverlayConvertWhiteout (mknod c 0 0) doesn't work in userns.
|
||||
@ -50,7 +53,18 @@ func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
|
||||
opts = append(opts, archive.WithParents(parents))
|
||||
}
|
||||
_, err = archive.Apply(ctx, path, r, opts...)
|
||||
if err == nil && sync {
|
||||
err = doSyncFs(path)
|
||||
}
|
||||
return err
|
||||
case sync && len(mounts) == 1 && mounts[0].Type == "bind":
|
||||
defer func() {
|
||||
if retErr != nil {
|
||||
return
|
||||
}
|
||||
|
||||
retErr = doSyncFs(mounts[0].Source)
|
||||
}()
|
||||
}
|
||||
return mount.WithTempMount(ctx, mounts, func(root string) error {
|
||||
_, err := archive.Apply(ctx, root, r)
|
||||
@ -75,3 +89,17 @@ func getOverlayPath(options []string) (upper string, lower []string, err error)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func doSyncFs(file string) error {
|
||||
fd, err := os.Open(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open %s: %w", file, err)
|
||||
}
|
||||
defer fd.Close()
|
||||
|
||||
_, _, errno := unix.Syscall(unix.SYS_SYNCFS, fd.Fd(), 0, 0)
|
||||
if errno != 0 {
|
||||
return fmt.Errorf("failed to syncfs for %s: %w", file, errno)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
@ -26,7 +26,8 @@ import (
|
||||
"github.com/containerd/containerd/v2/mount"
|
||||
)
|
||||
|
||||
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader) error {
|
||||
func apply(ctx context.Context, mounts []mount.Mount, r io.Reader, _sync bool) error {
|
||||
// TODO: for windows, how to sync?
|
||||
return mount.WithTempMount(ctx, mounts, func(root string) error {
|
||||
_, err := archive.Apply(ctx, root, r)
|
||||
return err
|
||||
|
10
diff/diff.go
10
diff/diff.go
@ -67,6 +67,8 @@ type Comparer interface {
|
||||
type ApplyConfig struct {
|
||||
// ProcessorPayloads specifies the payload sent to various processors
|
||||
ProcessorPayloads map[string]typeurl.Any
|
||||
// SyncFs is to synchronize the underlying filesystem containing files
|
||||
SyncFs bool
|
||||
}
|
||||
|
||||
// ApplyOpt is used to configure an Apply operation
|
||||
@ -125,6 +127,14 @@ func WithPayloads(payloads map[string]typeurl.Any) ApplyOpt {
|
||||
}
|
||||
}
|
||||
|
||||
// WithSyncFs sets sync flag to the config.
|
||||
func WithSyncFs(sync bool) ApplyOpt {
|
||||
return func(_ context.Context, _ ocispec.Descriptor, c *ApplyConfig) error {
|
||||
c.SyncFs = sync
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// WithSourceDateEpoch specifies the timestamp used to provide control for reproducibility.
|
||||
// See also https://reproducible-builds.org/docs/source-date-epoch/ .
|
||||
//
|
||||
|
@ -61,6 +61,7 @@ func (r *diffRemote) Apply(ctx context.Context, desc ocispec.Descriptor, mounts
|
||||
Diff: oci.DescriptorToProto(desc),
|
||||
Mounts: mount.ToProto(mounts),
|
||||
Payloads: payloads,
|
||||
SyncFs: config.SyncFs,
|
||||
}
|
||||
resp, err := r.client.Apply(ctx, req)
|
||||
if err != nil {
|
||||
|
@ -357,6 +357,9 @@ type PluginConfig struct {
|
||||
//
|
||||
// For example, the value can be '5h', '2h30m', '10s'.
|
||||
DrainExecSyncIOTimeout string `toml:"drain_exec_sync_io_timeout" json:"drainExecSyncIOTimeout"`
|
||||
// ImagePullWithSyncFs is an experimental setting. It's to force sync
|
||||
// filesystem during unpacking to ensure that data integrity.
|
||||
ImagePullWithSyncFs bool `toml:"image_pull_with_sync_fs" json:"imagePullWithSyncFs"`
|
||||
}
|
||||
|
||||
// X509KeyPairStreaming contains the x509 configuration for streaming
|
||||
|
@ -100,6 +100,7 @@ func DefaultConfig() PluginConfig {
|
||||
CDISpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
ImagePullProgressTimeout: defaultImagePullProgressTimeoutDuration.String(),
|
||||
DrainExecSyncIOTimeout: "0s",
|
||||
ImagePullWithSyncFs: false,
|
||||
EnableUnprivilegedPorts: true,
|
||||
EnableUnprivilegedICMP: true,
|
||||
}
|
||||
|
@ -39,6 +39,7 @@ import (
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
|
||||
containerd "github.com/containerd/containerd/v2/client"
|
||||
"github.com/containerd/containerd/v2/diff"
|
||||
"github.com/containerd/containerd/v2/errdefs"
|
||||
containerdimages "github.com/containerd/containerd/v2/images"
|
||||
"github.com/containerd/containerd/v2/pkg/cri/annotations"
|
||||
@ -165,6 +166,7 @@ func (c *CRIImageService) PullImage(ctx context.Context, r *runtime.PullImageReq
|
||||
containerd.WithImageHandler(imageHandler),
|
||||
containerd.WithUnpackOpts([]containerd.UnpackOpt{
|
||||
containerd.WithUnpackDuplicationSuppressor(c.unpackDuplicationSuppressor),
|
||||
containerd.WithUnpackApplyOpts(diff.WithSyncFs(c.config.ImagePullWithSyncFs)),
|
||||
}),
|
||||
}
|
||||
|
||||
|
@ -106,6 +106,7 @@ func (l *local) Apply(ctx context.Context, er *diffapi.ApplyRequest, _ ...grpc.C
|
||||
}
|
||||
opts = append(opts, diff.WithPayloads(payloads))
|
||||
}
|
||||
opts = append(opts, diff.WithSyncFs(er.SyncFs))
|
||||
|
||||
for _, differ := range l.differs {
|
||||
ocidesc, err = differ.Apply(ctx, desc, mounts, opts...)
|
||||
|
Loading…
Reference in New Issue
Block a user