Merge pull request #3924 from renzhengeek/renzhen/snapshot-gc

snapshots/devmapper: do not stop snapshot GC when one snapshot removing fails
This commit is contained in:
Maksym Pavlenko 2020-03-12 19:28:55 -07:00 committed by GitHub
commit e2e40e19d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 73 additions and 4 deletions

View File

@ -25,6 +25,7 @@ The following configuration flags are supported:
* `pool_name` - a name to use for the devicemapper thin pool. Pool name
should be the same as in `/dev/mapper/` directory
* `base_image_size` - defines how much space to allocate when creating the base device
* `async_remove` - flag to remove devices asynchronously using the snapshot GC's cleanup callback
Pool name and base image size are required snapshotter parameters.

View File

@ -40,6 +40,9 @@ type Config struct {
// Defines how much space to allocate when creating base image for container
BaseImageSize string `toml:"base_image_size"`
BaseImageSizeBytes uint64 `toml:"-"`
// Flag to remove devices asynchronously using the Cleanup() callback of the snapshots GC
AsyncRemove bool `toml:"async_remove"`
}
// LoadConfig reads devmapper configuration file from disk in TOML format

View File

@ -122,6 +122,14 @@ func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
return nil
}
// ChangeDeviceState sets the state of the device called name in the devices bucket.
// The change is applied through UpdateDevice, and whatever error UpdateDevice
// reports is returned to the caller unchanged.
func (m *PoolMetadata) ChangeDeviceState(ctx context.Context, name string, state DeviceState) error {
	// The mutation itself cannot fail; it only overwrites the State field.
	setState := func(info *DeviceInfo) error {
		info.State = state
		return nil
	}

	return m.UpdateDevice(ctx, name, setState)
}
// MarkFaulty marks the given device and corresponding devmapper device ID as faulty.
// The snapshotter might attempt to recreate a device in 'Faulty' state with another devmapper ID in
// subsequent calls, and in case of success its status will be changed to 'Created' or 'Activated'.

View File

@ -84,7 +84,7 @@ func (p *PoolDevice) ensureDeviceStates(ctx context.Context) error {
var faultyDevices []*DeviceInfo
var activatedDevices []*DeviceInfo
if err := p.metadata.WalkDevices(ctx, func(info *DeviceInfo) error {
if err := p.WalkDevices(ctx, func(info *DeviceInfo) error {
switch info.State {
case Suspended, Resumed, Deactivated, Removed, Faulty:
case Activated:
@ -494,6 +494,18 @@ func (p *PoolDevice) RemovePool(ctx context.Context) error {
return result.ErrorOrNil()
}
// MarkDeviceState records the given state for the device called name in the
// metastore by delegating to the pool metadata's ChangeDeviceState.
func (p *PoolDevice) MarkDeviceState(ctx context.Context, name string, state DeviceState) error {
	if err := p.metadata.ChangeDeviceState(ctx, name, state); err != nil {
		return err
	}

	return nil
}
// WalkDevices invokes cb for the devices stored in the pool metadata.
// It forwards cb to the metadata store's WalkDevices and returns its result.
func (p *PoolDevice) WalkDevices(ctx context.Context, cb func(info *DeviceInfo) error) error {
	return p.metadata.WalkDevices(ctx, cb)
}
// Close closes pool device (thin-pool will not be removed)
func (p *PoolDevice) Close() error {
return p.metadata.Close()

View File

@ -27,6 +27,7 @@ import (
"strings"
"sync"
"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/mount"
"github.com/containerd/containerd/plugin"
@ -303,10 +304,23 @@ func (s *Snapshotter) removeDevice(ctx context.Context, key string) error {
}
deviceName := s.getDeviceName(snapID)
if !s.config.AsyncRemove {
if err := s.pool.RemoveDevice(ctx, deviceName); err != nil {
log.G(ctx).WithError(err).Errorf("failed to remove device")
// Tell the snapshot GC to continue collecting other snapshots.
// Otherwise, a single failed removal would stop the GC, and no
// snapshots would be collected, even those unrelated to the
// failed one.
return errdefs.ErrFailedPrecondition
}
} else {
// The asynchronous cleanup will do the real device remove work.
log.G(ctx).WithField("device", deviceName).Debug("async remove")
if err := s.pool.MarkDeviceState(ctx, deviceName, Removed); err != nil {
log.G(ctx).WithError(err).Errorf("failed to mark device as removed")
return err
}
}
return nil
}
@ -486,3 +500,34 @@ func (s *Snapshotter) withTransaction(ctx context.Context, writable bool, fn fun
return nil
}
// Cleanup removes every device whose metadata state is Removed.
//
// It is a no-op returning nil unless the AsyncRemove config option is set.
// When enabled, it first walks the pool's devices to collect those in the
// Removed state, then calls RemoveDevice on each of them. A failure to remove
// one device does not stop the loop: all per-device errors are accumulated
// and returned together. A failure of the walk itself is returned immediately.
func (s *Snapshotter) Cleanup(ctx context.Context) error {
	if !s.config.AsyncRemove {
		return nil
	}

	// Collect first, remove afterwards, so no device is removed while the
	// metadata walk is still in progress.
	var removedDevices []*DeviceInfo
	if err := s.pool.WalkDevices(ctx, func(info *DeviceInfo) error {
		if info.State == Removed {
			removedDevices = append(removedDevices, info)
		}
		return nil
	}); err != nil {
		log.G(ctx).WithError(err).Errorf("failed to query devices from metastore")
		return err
	}

	var result *multierror.Error
	for _, dev := range removedDevices {
		log.G(ctx).WithField("device", dev.Name).Debug("cleanup device")

		if err := s.pool.RemoveDevice(ctx, dev.Name); err != nil {
			// Attach the cause so the log entry is actionable on its own,
			// matching the other error logs in this snapshotter.
			log.G(ctx).WithError(err).WithField("device", dev.Name).Error("failed to cleanup device")
			result = multierror.Append(result, err)
			continue
		}

		log.G(ctx).WithField("device", dev.Name).Debug("cleanuped device")
	}

	return result.ErrorOrNil()
}