/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package blockfile

import (
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"

	"github.com/containerd/containerd/v2/core/mount"
	"github.com/containerd/containerd/v2/core/snapshots"
	"github.com/containerd/containerd/v2/core/snapshots/storage"
	"github.com/containerd/continuity/fs"
	"github.com/containerd/log"
	"github.com/containerd/plugin"
)

// viewHookHelper is only used in tests to recover the filesystem.
type viewHookHelper func(backingFile string, fsType string, defaultOpts []string) error

// SnapshotterConfig holds the configurable properties for the blockfile snapshotter
type SnapshotterConfig struct {
	// recreateScratch is whether scratch should be recreated even
	// if it already exists
	recreateScratch bool

	scratchGenerator func(string) error

	// fsType is the filesystem type for the mount (defaults to ext4)
	fsType string

	// mountOptions are the base options added to the mount (defaults to ["loop"])
	mountOptions []string

	// testViewHookHelper is used to fsck or mount with rw to handle
	// the recovery. If we mount ro for a view snapshot, we might hit
	// an issue like
	//
	//	(ext4) INFO: recovery required on readonly filesystem
	//	(ext4) write access unavailable, cannot proceed (try mounting with noload)
	//
	// FIXME(fuweid): I don't hit the readonly issue on ssd storage. But it's
	// easy to reproduce it on slow storage.
	testViewHookHelper viewHookHelper
}

// Opt is an option to configure the blockfile snapshotter
type Opt func(string, *SnapshotterConfig)

// WithScratchFile provides a scratch file which will get copied on startup
// if the scratch file needs to be generated.
func WithScratchFile(src string) Opt {
	return func(root string, config *SnapshotterConfig) {
		config.scratchGenerator = func(dst string) error {
			// Copy src to dst
			if err := copyFileWithSync(dst, src); err != nil {
				return fmt.Errorf("failed to copy scratch: %w", err)
			}
			return nil
		}
	}
}

// WithFSType defines the filesystem type to apply to mounts of the blockfile
func WithFSType(fsType string) Opt {
	return func(root string, config *SnapshotterConfig) {
		config.fsType = fsType
	}
}

// WithMountOptions defines the mount options used for the mount
func WithMountOptions(options []string) Opt {
	return func(root string, config *SnapshotterConfig) {
		config.mountOptions = options
	}
}

// WithRecreateScratch is used to determine whether scratch should be recreated
// even if it already exists.
func WithRecreateScratch(recreate bool) Opt {
	return func(root string, config *SnapshotterConfig) {
		config.recreateScratch = recreate
	}
}
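// The recovery scenario described on testViewHookHelper above is easiest to
// see with a concrete sketch. A minimal helper could replay the ext4 journal
// by mounting the backing file read-write once and unmounting it before the
// read-only view mount happens. This is an illustrative assumption, not the
// helper used by the actual tests; mount.All and mount.UnmountAll are
// existing helpers from the imported mount package.
//
//	func recoverJournal(backingFile, fsType string, defaultOpts []string) error {
//		tmp, err := os.MkdirTemp("", "blockfile-recover")
//		if err != nil {
//			return err
//		}
//		defer os.RemoveAll(tmp)
//		m := []mount.Mount{{
//			Source:  backingFile,
//			Type:    fsType,
//			Options: append(defaultOpts, "rw"),
//		}}
//		if err := mount.All(m, tmp); err != nil {
//			return err
//		}
//		return mount.UnmountAll(tmp, 0)
//	}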
// withViewHookHelper introduces a hook for preparing a snapshot for View. It
// should be used in tests only.
//
//nolint:nolintlint,unused // not used on all platforms
func withViewHookHelper(fn viewHookHelper) Opt {
	return func(_ string, config *SnapshotterConfig) {
		config.testViewHookHelper = fn
	}
}

type snapshotter struct {
	root    string
	scratch string
	fsType  string
	options []string
	ms      *storage.MetaStore

	testViewHookHelper viewHookHelper
}

// NewSnapshotter returns a Snapshotter which copies layers on the underlying
// file system. A metadata file is stored under the root.
func NewSnapshotter(root string, opts ...Opt) (snapshots.Snapshotter, error) {
	var config SnapshotterConfig
	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}

	for _, opt := range opts {
		opt(root, &config)
	}

	scratch := filepath.Join(root, "scratch")
	createScratch := config.recreateScratch
	if !createScratch {
		if _, err := os.Stat(scratch); err != nil {
			if !os.IsNotExist(err) {
				return nil, fmt.Errorf("unable to stat scratch file: %w", err)
			}
			createScratch = true
		}
	}

	if createScratch {
		if config.scratchGenerator == nil {
			return nil, fmt.Errorf("no scratch file generator: %w", plugin.ErrSkipPlugin)
		}
		if err := config.scratchGenerator(scratch); err != nil {
			return nil, fmt.Errorf("failed to generate scratch file: %w", err)
		}
	}

	if config.fsType == "" {
		config.fsType = "ext4"
	}

	if config.mountOptions == nil {
		config.mountOptions = []string{"loop"}
	}

	ms, err := storage.NewMetaStore(filepath.Join(root, "metadata.db"))
	if err != nil {
		return nil, err
	}

	if err := os.Mkdir(filepath.Join(root, "snapshots"), 0700); err != nil && !os.IsExist(err) {
		return nil, err
	}

	return &snapshotter{
		root:    root,
		scratch: scratch,
		fsType:  config.fsType,
		options: config.mountOptions,
		ms:      ms,

		testViewHookHelper: config.testViewHookHelper,
	}, nil
}
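// A minimal construction sketch for NewSnapshotter. The paths below are
// illustrative only, and scratch.img is assumed to be a pre-built image
// (for example ext4-formatted) that seeds the scratch file on first start:
//
//	sn, err := NewSnapshotter("/var/lib/containerd-blockfile",
//		WithScratchFile("/var/lib/containerd-blockfile/scratch.img"),
//		WithFSType("ext4"),
//		WithMountOptions([]string{"loop"}),
//	)
//	if err != nil {
//		// handle error
//	}
//	defer sn.Close()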
// Stat returns the info for an active or committed snapshot by name or
// key.
//
// Should be used for parent resolution, existence checks and to discern
// the kind of snapshot.
func (o *snapshotter) Stat(ctx context.Context, key string) (info snapshots.Info, err error) {
	err = o.ms.WithTransaction(ctx, false, func(ctx context.Context) error {
		_, info, _, err = storage.GetInfo(ctx, key)
		return err
	})
	if err != nil {
		return snapshots.Info{}, err
	}

	return info, nil
}

func (o *snapshotter) Update(ctx context.Context, info snapshots.Info, fieldpaths ...string) (_ snapshots.Info, err error) {
	err = o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
		info, err = storage.UpdateInfo(ctx, info, fieldpaths...)
		return err
	})
	if err != nil {
		return snapshots.Info{}, err
	}

	return info, nil
}

func (o *snapshotter) Usage(ctx context.Context, key string) (usage snapshots.Usage, err error) {
	var (
		id   string
		info snapshots.Info
	)
	err = o.ms.WithTransaction(ctx, false, func(ctx context.Context) error {
		id, info, usage, err = storage.GetInfo(ctx, key)
		if err != nil {
			return err
		}

		// The current usage calculation is an approximation: the size of the
		// block file minus the size of its parent. This does not consider
		// that the filesystem may not support shared extents between the block
		// file and its parent, in which case the accurate calculation would just
		// be the size of the block file. Additionally, this does not take into
		// consideration that a file may have been removed before being added,
		// making the number of shared extents between the parent and the block
		// file smaller than the parent, under-reporting actual usage.
		//
		// A more ideal calculation would look like:
		//	size(block) - usage(extent_intersection(block,parent))
		// OR
		//	usage(extent_union(block,parent)) - size(parent)

		if info.Kind == snapshots.KindActive {
			// TODO: Use size calculator from fs package
			st, err := os.Stat(o.getBlockFile(id))
			if err != nil {
				return err
			}

			usage.Size = st.Size()
			usage.Inodes = 1
		}

		if info.Parent != "" {
			// GetInfo returns the total number of bytes used by a snapshot (including its parent).
			// So subtract the parent usage in order to get the delta consumed by the layer itself.
			_, _, parentUsage, err := storage.GetInfo(ctx, info.Parent)
			if err != nil {
				return err
			}

			usage.Size -= parentUsage.Size
		}

		return err
	})
	if err != nil {
		return snapshots.Usage{}, err
	}

	return usage, nil
}

func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
	return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts)
}

func (o *snapshotter) View(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
	return o.createSnapshot(ctx, snapshots.KindView, key, parent, opts)
}

// Mounts returns the mounts for the transaction identified by key. Can be
// called on a read-write or readonly transaction.
//
// This can be used to recover mounts after calling View or Prepare.
func (o *snapshotter) Mounts(ctx context.Context, key string) (_ []mount.Mount, err error) {
	var s storage.Snapshot
	err = o.ms.WithTransaction(ctx, false, func(ctx context.Context) error {
		s, err = storage.GetSnapshot(ctx, key)
		if err != nil {
			return fmt.Errorf("failed to get snapshot mount: %w", err)
		}
		return nil
	})
	if err != nil {
		return nil, err
	}

	return o.mounts(s), nil
}

func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
	return o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
		id, _, _, err := storage.GetInfo(ctx, key)
		if err != nil {
			return err
		}

		st, err := os.Stat(o.getBlockFile(id))
		if err != nil {
			return err
		}

		usage := snapshots.Usage{
			Size:   st.Size(),
			Inodes: 1,
		}

		if _, err = storage.CommitActive(ctx, key, name, usage, opts...); err != nil {
			return fmt.Errorf("failed to commit snapshot: %w", err)
		}
		return nil
	})
}

// Remove abandons the transaction identified by key. All resources
// associated with the key will be removed.
func (o *snapshotter) Remove(ctx context.Context, key string) (err error) {
	var (
		renamed, path string
		restore       bool
	)

	err = o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
		id, _, err := storage.Remove(ctx, key)
		if err != nil {
			return fmt.Errorf("failed to remove: %w", err)
		}

		path = o.getBlockFile(id)
		renamed = filepath.Join(o.root, "snapshots", "rm-"+id)
		if err = os.Rename(path, renamed); err != nil {
			if !os.IsNotExist(err) {
				return fmt.Errorf("failed to rename: %w", err)
			}
			renamed = ""
		}

		restore = true
		return nil
	})
	if err != nil {
		if renamed != "" && restore {
			if err1 := os.Rename(renamed, path); err1 != nil {
				// May cause inconsistent data on disk
				log.G(ctx).WithError(err1).WithField("path", renamed).Error("failed to rename after failed commit")
			}
		}
		return err
	}

	if renamed != "" {
		if err := os.Remove(renamed); err != nil {
			// Must be cleaned up, any "rm-*" could be removed if no active transactions
			log.G(ctx).WithError(err).WithField("path", renamed).Warnf("failed to remove root filesystem")
		}
	}

	return nil
}
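// A usage sketch for the mounts returned by Prepare, View or Mounts: each
// snapshot is exposed as a single loopback mount of its block file, e.g.
// {Source: "<root>/snapshots/<id>", Type: "ext4", Options: ["loop", "rw"]}.
// The target directory below is hypothetical; mount.All is the existing
// helper from the imported mount package.
//
//	mounts, err := sn.Prepare(ctx, "mykey", "")
//	if err != nil {
//		// handle error
//	}
//	if err := mount.All(mounts, "/mnt/mykey"); err != nil {
//		// handle error
//	}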
// Walk the committed snapshots.
func (o *snapshotter) Walk(ctx context.Context, fn snapshots.WalkFunc, fs ...string) error {
	return o.ms.WithTransaction(ctx, false, func(ctx context.Context) error {
		return storage.WalkInfo(ctx, fn, fs...)
	})
}

func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) (_ []mount.Mount, err error) {
	var s storage.Snapshot
	err = o.ms.WithTransaction(ctx, true, func(ctx context.Context) error {
		s, err = storage.CreateSnapshot(ctx, kind, key, parent, opts...)
		if err != nil {
			return fmt.Errorf("failed to create snapshot: %w", err)
		}

		var path string
		if len(s.ParentIDs) == 0 || s.Kind == snapshots.KindActive {
			path = o.getBlockFile(s.ID)

			if len(s.ParentIDs) > 0 {
				if err = copyFileWithSync(path, o.getBlockFile(s.ParentIDs[0])); err != nil {
					return fmt.Errorf("copying of parent failed: %w", err)
				}
			} else {
				if err = copyFileWithSync(path, o.scratch); err != nil {
					return fmt.Errorf("copying of scratch failed: %w", err)
				}
			}
		} else {
			path = o.getBlockFile(s.ParentIDs[0])
		}

		if o.testViewHookHelper != nil {
			if err := o.testViewHookHelper(path, o.fsType, o.options); err != nil {
				return fmt.Errorf("failed to handle the viewHookHelper: %w", err)
			}
		}

		return nil
	})
	if err != nil {
		return nil, err
	}

	return o.mounts(s), nil
}

func (o *snapshotter) getBlockFile(id string) string {
	return filepath.Join(o.root, "snapshots", id)
}

func (o *snapshotter) mounts(s storage.Snapshot) []mount.Mount {
	var (
		mountOptions = o.options
		source       string
	)

	if s.Kind == snapshots.KindView {
		mountOptions = append(mountOptions, "ro")
	} else {
		mountOptions = append(mountOptions, "rw")
	}

	if len(s.ParentIDs) == 0 || s.Kind == snapshots.KindActive {
		source = o.getBlockFile(s.ID)
	} else {
		source = o.getBlockFile(s.ParentIDs[0])
	}

	return []mount.Mount{
		{
			Source:  source,
			Type:    o.fsType,
			Options: mountOptions,
		},
	}
}

// Close closes the snapshotter
func (o *snapshotter) Close() error {
	return o.ms.Close()
}

func copyFileWithSync(target, source string) error {
	// The Go stdlib does not seem to have an efficient os.File.ReadFrom
	// routine for other platforms like it does on Linux with
	// copy_file_range. For Darwin at least we can use clonefile
	// in its place, otherwise if we have a sparse file we'd have
	// a fun surprise waiting below.
	//
	// TODO: Enlighten other platforms (windows?)
	if runtime.GOOS == "darwin" {
		return fs.CopyFile(target, source)
	}

	src, err := os.Open(source)
	if err != nil {
		return fmt.Errorf("failed to open source %s: %w", source, err)
	}
	defer src.Close()
	tgt, err := os.Create(target)
	if err != nil {
		return fmt.Errorf("failed to open target %s: %w", target, err)
	}
	defer tgt.Close()
	defer tgt.Sync()

	_, err = io.Copy(tgt, src)
	return err
}
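// The scratch file handed to WithScratchFile has to be a pre-formatted
// filesystem image matching WithFSType. A rough sketch of producing one,
// assuming mkfs.ext4 is available on the host (invoked via os/exec) and
// that the size and path are illustrative only; this helper is not part of
// the package:
//
//	func makeScratch(path string, sizeBytes int64) error {
//		f, err := os.Create(path)
//		if err != nil {
//			return err
//		}
//		// Create a sparse file of the requested size, then format it.
//		if err := f.Truncate(sizeBytes); err != nil {
//			f.Close()
//			return err
//		}
//		if err := f.Close(); err != nil {
//			return err
//		}
//		return exec.Command("mkfs.ext4", "-F", path).Run()
//	}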