snapshots/devmapper: deactivate thin device after committed

1. reason to deactivate committed snapshot

The thin device will not be used for IO after committed,
and further thin snapshotting is OK using an inactive thin
device as origin. The benefits to deactivate are:
 - device is not unneccesary visible avoiding any unexpected IO;
 - save useless kernel data structs for maintaining active dm.

 Quote from kernel doc (Documentation/device-mapper/provisioning.txt):

"
  ii) Using an internal snapshot.

  Once created, the user doesn't have to worry about any connection
  between the origin and the snapshot.  Indeed the snapshot is no
  different from any other thinly-provisioned device and can be
  snapshotted itself via the same method.  It's perfectly legal to
  have only one of them active, and there's no ordering requirement on
  activating or removing them both.  (This differs from conventional
  device-mapper snapshots.)
"

2. an thinpool metadata bug is naturally removed

An problem happens when failed to suspend/resume origin thin device
when creating snapshot:

"failed to create snapshot device from parent vg0-mythinpool-snap-3"
error="failed to save initial metadata for snapshot "vg0-mythinpool-snap-19":
object already exists"

This issue occurs because when failed to create snapshot, the
snapshotter.store can be rollbacked, but the thin pool metadata
boltdb failed to rollback in PoolDevice.CreateSnapshotDevice(),
therefore metadata becomes inconsistent: the snapshotID is not
taken in snapshotter.store, but saved in pool metadata boltdb.

The cause is, in PoolDevice.CreateSnapshotDevice(), the defer calls
are invoked on "first-in-last-out" order. When the error happens
on the "resume device" defer call, the metadata is saved and
snapshot is created, which has no chance to be rollbacked.

Signed-off-by: Eric Ren <renzhen@linux.alibaba.com>
This commit is contained in:
renzhen.rz 2019-05-07 21:06:43 +08:00
parent d68b593de4
commit 3887053177
4 changed files with 62 additions and 48 deletions

View File

@ -201,6 +201,13 @@ func RemoveDevice(deviceName string, opts ...RemoveDeviceOpt) error {
args = append(args, GetFullDevicePath(deviceName))
_, err := dmsetup(args...)
if err == unix.ENXIO {
// Ignore "No such device or address" error because we dmsetup
// remove with "deferred" option, there is chance for the device
// having been removed.
return nil
}
return err
}
@ -372,9 +379,13 @@ func parseDmsetupError(output string) string {
return ""
}
const failedSubstr = "failed: "
line := lines[0]
// Handle output like "Device /dev/mapper/snapshotter-suite-pool-snap-1 not found"
if strings.HasSuffix(line, "not found") {
return unix.ENXIO.Error()
}
const failedSubstr = "failed: "
idx := strings.LastIndex(line, failedSubstr)
if idx == -1 {
return ""

View File

@ -178,19 +178,6 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string
return errors.Wrapf(err, "failed to query device metadata for %q", deviceName)
}
// Suspend thin device if it was activated previously to avoid corruptions
isActivated := p.IsActivated(baseInfo.Name)
if isActivated {
if err := p.suspendDevice(ctx, baseInfo); err != nil {
return err
}
// Resume back base thin device on exit
defer func() {
retErr = multierror.Append(retErr, p.resumeDevice(ctx, baseInfo)).ErrorOrNil()
}()
}
snapInfo := &DeviceInfo{
Name: snapshotName,
Size: virtualSizeBytes,
@ -230,26 +217,6 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string
return p.activateDevice(ctx, snapInfo)
}
func (p *PoolDevice) suspendDevice(ctx context.Context, info *DeviceInfo) error {
if err := p.transition(ctx, info.Name, Suspending, Suspended, func() error {
return dmsetup.SuspendDevice(info.Name)
}); err != nil {
return errors.Wrapf(err, "failed to suspend device %q", info.Name)
}
return nil
}
func (p *PoolDevice) resumeDevice(ctx context.Context, info *DeviceInfo) error {
if err := p.transition(ctx, info.Name, Resuming, Resumed, func() error {
return dmsetup.ResumeDevice(info.Name)
}); err != nil {
return errors.Wrapf(err, "failed to resume device %q", info.Name)
}
return nil
}
func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error {
if err := p.transition(ctx, snapInfo.Name, Creating, Created, func() error {
return dmsetup.CreateSnapshot(p.poolName, snapInfo.DeviceID, baseInfo.DeviceID)
@ -265,16 +232,30 @@ func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *Dev
return nil
}
// SuspendDevice flushes the outstanding IO and blocks the further IO
func (p *PoolDevice) SuspendDevice(ctx context.Context, deviceName string) error {
if err := p.transition(ctx, deviceName, Suspending, Suspended, func() error {
return dmsetup.SuspendDevice(deviceName)
}); err != nil {
return errors.Wrapf(err, "failed to suspend device %q", deviceName)
}
return nil
}
// DeactivateDevice deactivates thin device
func (p *PoolDevice) DeactivateDevice(ctx context.Context, deviceName string, deferred bool) error {
if !p.IsActivated(deviceName) {
func (p *PoolDevice) DeactivateDevice(ctx context.Context, deviceName string, deferred, withForce bool) error {
if !p.IsLoaded(deviceName) {
return nil
}
opts := []dmsetup.RemoveDeviceOpt{dmsetup.RemoveWithForce, dmsetup.RemoveWithRetries}
opts := []dmsetup.RemoveDeviceOpt{dmsetup.RemoveWithRetries}
if deferred {
opts = append(opts, dmsetup.RemoveDeferred)
}
if withForce {
opts = append(opts, dmsetup.RemoveWithForce)
}
if err := p.transition(ctx, deviceName, Deactivating, Deactivated, func() error {
return dmsetup.RemoveDevice(deviceName, opts...)
@ -300,6 +281,12 @@ func (p *PoolDevice) IsActivated(deviceName string) bool {
return true
}
// IsLoaded returns true if thin-device is visible for dmsetup
func (p *PoolDevice) IsLoaded(deviceName string) bool {
_, err := dmsetup.Info(deviceName)
return err == nil
}
// GetUsage reports total size in bytes consumed by a thin-device.
// It relies on the number of used blocks reported by 'dmsetup status'.
// The output looks like:
@ -330,7 +317,7 @@ func (p *PoolDevice) RemoveDevice(ctx context.Context, deviceName string) error
return errors.Wrapf(err, "can't query metadata for device %q", deviceName)
}
if err := p.DeactivateDevice(ctx, deviceName, true); err != nil {
if err := p.DeactivateDevice(ctx, deviceName, true, true); err != nil {
return err
}
@ -368,7 +355,7 @@ func (p *PoolDevice) RemovePool(ctx context.Context) error {
// Deactivate devices if any
for _, name := range deviceNames {
if err := p.DeactivateDevice(ctx, name, true); err != nil {
if err := p.DeactivateDevice(ctx, name, true, true); err != nil {
result = multierror.Append(result, errors.Wrapf(err, "failed to remove %q", name))
}
}

View File

@ -107,19 +107,24 @@ func TestPoolDevice(t *testing.T) {
testMakeFileSystem(t, pool)
})
// Mount 'thin-1'
// Mount 'thin-1' and write v1 test file on 'thin-1' device
err = mount.WithTempMount(ctx, getMounts(thinDevice1), func(thin1MountPath string) error {
// Write v1 test file on 'thin-1' device
thin1TestFilePath := filepath.Join(thin1MountPath, "TEST")
err := ioutil.WriteFile(thin1TestFilePath, []byte("test file (v1)"), 0700)
assert.NilError(t, err, "failed to write test file v1 on '%s' volume", thinDevice1)
// Take snapshot of 'thin-1'
t.Run("CreateSnapshotDevice", func(t *testing.T) {
testCreateSnapshot(t, pool)
})
return nil
})
// Update TEST file on 'thin-1' to v2
// Take snapshot of 'thin-1'
t.Run("CreateSnapshotDevice", func(t *testing.T) {
testCreateSnapshot(t, pool)
})
// Update TEST file on 'thin-1' to v2
err = mount.WithTempMount(ctx, getMounts(thinDevice1), func(thin1MountPath string) error {
thin1TestFilePath := filepath.Join(thin1MountPath, "TEST")
err = ioutil.WriteFile(thin1TestFilePath, []byte("test file (v2)"), 0700)
assert.NilError(t, err, "failed to write test file v2 on 'thin-1' volume after taking snapshot")
@ -204,7 +209,7 @@ func testDeactivateThinDevice(t *testing.T, pool *PoolDevice) {
for _, deviceName := range deviceList {
assert.Assert(t, pool.IsActivated(deviceName))
err := pool.DeactivateDevice(context.Background(), deviceName, false)
err := pool.DeactivateDevice(context.Background(), deviceName, false, true)
assert.NilError(t, err, "failed to remove '%s'", deviceName)
assert.Assert(t, !pool.IsActivated(deviceName))

View File

@ -272,7 +272,18 @@ func (s *Snapshotter) Commit(ctx context.Context, name, key string, opts ...snap
}
_, err = storage.CommitActive(ctx, key, name, usage, opts...)
return err
if err != nil {
return err
}
// The thin snapshot is not used for IO after committed, so
// suspend to flush the IO and deactivate the device.
err = s.pool.SuspendDevice(ctx, deviceName)
if err != nil {
return err
}
return s.pool.DeactivateDevice(ctx, deviceName, true, false)
})
}