From 878a3205cdfedf616f0db82381d6841ffdb9461c Mon Sep 17 00:00:00 2001 From: Maksym Pavlenko Date: Tue, 30 Jul 2019 15:17:17 -0700 Subject: [PATCH 1/3] Better error recovery in devmapper Signed-off-by: Maksym Pavlenko --- snapshots/devmapper/device_info.go | 4 + snapshots/devmapper/metadata.go | 42 +++++++++- snapshots/devmapper/metadata_test.go | 32 ++++++- snapshots/devmapper/pool_device.go | 121 +++++++++++++++++++-------- 4 files changed, 157 insertions(+), 42 deletions(-) diff --git a/snapshots/devmapper/device_info.go b/snapshots/devmapper/device_info.go index 6d65d557a..e9dea3455 100644 --- a/snapshots/devmapper/device_info.go +++ b/snapshots/devmapper/device_info.go @@ -56,6 +56,8 @@ const ( Removing // Removed means that device successfully removed but not yet deleted from meta store Removed + // Faulty means that the device is errored and the snapshotter failed to rollback it + Faulty ) func (s DeviceState) String() string { @@ -84,6 +86,8 @@ func (s DeviceState) String() string { return "Removing" case Removed: return "Removed" + case Faulty: + return "Faulty" default: return fmt.Sprintf("unknown %d", s) } diff --git a/snapshots/devmapper/metadata.go b/snapshots/devmapper/metadata.go index cc17efe1f..4a861c5d3 100644 --- a/snapshots/devmapper/metadata.go +++ b/snapshots/devmapper/metadata.go @@ -24,6 +24,7 @@ import ( "fmt" "strconv" + "github.com/hashicorp/go-multierror" "github.com/pkg/errors" bolt "go.etcd.io/bbolt" ) @@ -38,6 +39,7 @@ type deviceIDState byte const ( deviceFree deviceIDState = iota deviceTaken + deviceFaulty ) // Bucket names @@ -92,11 +94,14 @@ func (m *PoolMetadata) ensureDatabaseInitialized() error { // AddDevice saves device info to database. func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error { - return m.db.Update(func(tx *bolt.Tx) error { + err := m.db.Update(func(tx *bolt.Tx) error { devicesBucket := tx.Bucket(devicesBucketName) - // Make sure device name is unique - if err := getObject(devicesBucket, info.Name, nil); err == nil { + // Make sure device name is unique. If there is already a device with the same name, + // but in Faulty state, give it a try with another devmapper device ID. + // See https://github.com/containerd/containerd/pull/3436 for more context. + var existing DeviceInfo + if err := getObject(devicesBucket, info.Name, &existing); err == nil && existing.State != Faulty { return ErrAlreadyExists } @@ -108,8 +113,37 @@ func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error { info.DeviceID = deviceID - return putObject(devicesBucket, info.Name, info, false) + return putObject(devicesBucket, info.Name, info, true) }) + + if err != nil { + return errors.Wrapf(err, "failed to save metadata for device %q (parent: %q)", info.Name, info.ParentName) + } + + return nil +} + +// MarkFaulty marks the given device and corresponding devmapper device ID as faulty. +// The snapshotter might attempt to recreate a device in 'Faulty' state with another devmapper ID in +// subsequent calls, and in case of success it's status will be changed to 'Created' or 'Activated'. +// The devmapper dev ID will remain in 'deviceFaulty' state until manually handled by a user. +func (m *PoolMetadata) MarkFaulty(ctx context.Context, info *DeviceInfo) error { + var result error + + if err := m.UpdateDevice(ctx, info.Name, func(deviceInfo *DeviceInfo) error { + deviceInfo.State = Faulty + return nil + }); err != nil { + result = multierror.Append(result, err) + } + + if err := m.db.Update(func(tx *bolt.Tx) error { + return markDeviceID(tx, info.DeviceID, deviceFaulty) + }); err != nil { + result = multierror.Append(result, err) + } + + return result } // getNextDeviceID finds the next free device ID by taking a cursor diff --git a/snapshots/devmapper/metadata_test.go b/snapshots/devmapper/metadata_test.go index 1149b79b1..c4309a598 100644 --- a/snapshots/devmapper/metadata_test.go +++ b/snapshots/devmapper/metadata_test.go @@ -23,8 +23,11 @@ import ( "io/ioutil" "os" "path/filepath" + "strconv" "testing" + "github.com/pkg/errors" + "go.etcd.io/bbolt" "gotest.tools/assert" is "gotest.tools/assert/cmp" ) @@ -77,7 +80,7 @@ func TestPoolMetadata_AddDeviceDuplicate(t *testing.T) { assert.NilError(t, err) err = store.AddDevice(testCtx, &DeviceInfo{Name: "test"}) - assert.Equal(t, ErrAlreadyExists, err) + assert.Equal(t, ErrAlreadyExists, errors.Cause(err)) } func TestPoolMetadata_ReuseDeviceID(t *testing.T) { @@ -151,6 +154,33 @@ func TestPoolMetadata_UpdateDevice(t *testing.T) { assert.Equal(t, Created, newInfo.State) } +func TestPoolMetadata_MarkFaulty(t *testing.T) { + tempDir, store := createStore(t) + defer cleanupStore(t, tempDir, store) + + info := &DeviceInfo{Name: "test"} + err := store.AddDevice(testCtx, info) + assert.NilError(t, err) + + err = store.MarkFaulty(testCtx, info) + assert.NilError(t, err) + + saved, err := store.GetDevice(testCtx, info.Name) + assert.NilError(t, err) + assert.Equal(t, saved.State, Faulty) + assert.Assert(t, saved.DeviceID > 0) + + // Make sure a device ID marked as faulty as well + err = store.db.View(func(tx *bbolt.Tx) error { + bucket := tx.Bucket(deviceIDBucketName) + key := strconv.FormatUint(uint64(saved.DeviceID), 10) + value := bucket.Get([]byte(key)) + assert.Equal(t, value[0], byte(deviceFaulty)) + return nil + }) + assert.NilError(t, err) +} + func TestPoolMetadata_GetDeviceNames(t *testing.T) { tempDir, store := createStore(t) defer cleanupStore(t, tempDir, store) diff --git a/snapshots/devmapper/pool_device.go b/snapshots/devmapper/pool_device.go index c9cc0d5ec..e720f33e8 100644 --- a/snapshots/devmapper/pool_device.go +++ b/snapshots/devmapper/pool_device.go @@ -118,35 +118,59 @@ func (p *PoolDevice) CreateThinDevice(ctx context.Context, deviceName string, vi State: Unknown, } - // Save initial device metadata and allocate new device ID from store - if err := p.metadata.AddDevice(ctx, info); err != nil { - return errors.Wrapf(err, "failed to save initial metadata for new thin device %q", deviceName) - } + var ( + metaErr error + devErr error + activeErr error + ) defer func() { - if retErr == nil { + // We've created a devmapper device, but failed to activate it, try rollback everything + if activeErr != nil { + // Delete the device first. + delErr := p.deleteDevice(ctx, info) + if delErr != nil { + // Failed to rollback, mark the device as faulty and keep metadata in order to + // preserve the faulty device ID + retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, info)) + return + } + + // The devmapper device has been successfully deleted, deallocate device ID + if err := p.RemoveDevice(ctx, info.Name); err != nil { + retErr = multierror.Append(retErr, err) + return + } + return } - // Rollback metadata - retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, info.Name)) + // We're unable to create the devmapper device, most likely something wrong with the deviceID + if devErr != nil { + retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, info)) + return + } }() + // Save initial device metadata and allocate new device ID from store + metaErr = p.metadata.AddDevice(ctx, info) + if metaErr != nil { + return metaErr + } + // Create thin device - if err := p.createDevice(ctx, info); err != nil { - return err + devErr = p.createDevice(ctx, info) + if devErr != nil { + return devErr } - defer func() { - if retErr == nil { - return - } + // Activate thin device + activeErr = p.activateDevice(ctx, info) + if activeErr != nil { + return activeErr + } - // Rollback creation - retErr = multierror.Append(retErr, p.deleteDevice(ctx, info)) - }() - - return p.activateDevice(ctx, info) + return nil } // createDevice creates thin device @@ -185,36 +209,59 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string State: Unknown, } - // Save snapshot metadata and allocate new device ID - if err := p.metadata.AddDevice(ctx, snapInfo); err != nil { - return errors.Wrapf(err, "failed to save initial metadata for snapshot %q", snapshotName) - } + var ( + metaErr error + devErr error + activeErr error + ) defer func() { - if retErr == nil { + // We've created a devmapper device, but failed to activate it, try rollback everything + if activeErr != nil { + // Delete the device first. + delErr := p.deleteDevice(ctx, snapInfo) + if delErr != nil { + // Failed to rollback, mark the device as faulty and keep metadata in order to + // preserve the faulty device ID + retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, snapInfo)) + return + } + + // The devmapper device has been successfully deleted, deallocate device ID + if err := p.RemoveDevice(ctx, snapInfo.Name); err != nil { + retErr = multierror.Append(retErr, err) + return + } + return } - // Rollback metadata - retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, snapInfo.Name)) + // We're unable to create the devmapper device, most likely something wrong with the deviceID + if devErr != nil { + retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, snapInfo)) + return + } }() + // Save snapshot metadata and allocate new device ID + metaErr = p.metadata.AddDevice(ctx, snapInfo) + if metaErr != nil { + return metaErr + } + // Create thin device snapshot - if err := p.createSnapshot(ctx, baseInfo, snapInfo); err != nil { - return err + devErr = p.createSnapshot(ctx, baseInfo, snapInfo) + if devErr != nil { + return devErr } - defer func() { - if retErr == nil { - return - } + // Activate the snapshot device + activeErr = p.activateDevice(ctx, snapInfo) + if activeErr != nil { + return activeErr + } - // Rollback snapshot creation - retErr = multierror.Append(retErr, p.deleteDevice(ctx, snapInfo)) - }() - - // Activate snapshot device - return p.activateDevice(ctx, snapInfo) + return nil } func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error { From 4d5a0e19ebe1fd57a4c2fc1f6b36249060bd8b4a Mon Sep 17 00:00:00 2001 From: Maksym Pavlenko Date: Tue, 30 Jul 2019 16:03:09 -0700 Subject: [PATCH 2/3] Mark faulty device in one transaction Signed-off-by: Maksym Pavlenko --- snapshots/devmapper/metadata.go | 31 ++++++++++++++-------------- snapshots/devmapper/metadata_test.go | 2 +- snapshots/devmapper/pool_device.go | 8 +++---- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/snapshots/devmapper/metadata.go b/snapshots/devmapper/metadata.go index 4a861c5d3..9b0f069b7 100644 --- a/snapshots/devmapper/metadata.go +++ b/snapshots/devmapper/metadata.go @@ -24,7 +24,6 @@ import ( "fmt" "strconv" - "github.com/hashicorp/go-multierror" "github.com/pkg/errors" bolt "go.etcd.io/bbolt" ) @@ -127,23 +126,25 @@ func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error { // The snapshotter might attempt to recreate a device in 'Faulty' state with another devmapper ID in // subsequent calls, and in case of success it's status will be changed to 'Created' or 'Activated'. // The devmapper dev ID will remain in 'deviceFaulty' state until manually handled by a user. -func (m *PoolMetadata) MarkFaulty(ctx context.Context, info *DeviceInfo) error { - var result error +func (m *PoolMetadata) MarkFaulty(ctx context.Context, name string) error { + return m.db.Update(func(tx *bolt.Tx) error { + var ( + device = DeviceInfo{} + devBucket = tx.Bucket(devicesBucketName) + ) - if err := m.UpdateDevice(ctx, info.Name, func(deviceInfo *DeviceInfo) error { - deviceInfo.State = Faulty - return nil - }); err != nil { - result = multierror.Append(result, err) - } + if err := getObject(devBucket, name, &device); err != nil { + return err + } - if err := m.db.Update(func(tx *bolt.Tx) error { - return markDeviceID(tx, info.DeviceID, deviceFaulty) - }); err != nil { - result = multierror.Append(result, err) - } + device.State = Faulty - return result + if err := putObject(devBucket, name, &device, true); err != nil { + return err + } + + return markDeviceID(tx, device.DeviceID, deviceFaulty) + }) } // getNextDeviceID finds the next free device ID by taking a cursor diff --git a/snapshots/devmapper/metadata_test.go b/snapshots/devmapper/metadata_test.go index c4309a598..750802733 100644 --- a/snapshots/devmapper/metadata_test.go +++ b/snapshots/devmapper/metadata_test.go @@ -162,7 +162,7 @@ func TestPoolMetadata_MarkFaulty(t *testing.T) { err := store.AddDevice(testCtx, info) assert.NilError(t, err) - err = store.MarkFaulty(testCtx, info) + err = store.MarkFaulty(testCtx, "test") assert.NilError(t, err) saved, err := store.GetDevice(testCtx, info.Name) diff --git a/snapshots/devmapper/pool_device.go b/snapshots/devmapper/pool_device.go index e720f33e8..68e07af87 100644 --- a/snapshots/devmapper/pool_device.go +++ b/snapshots/devmapper/pool_device.go @@ -132,7 +132,7 @@ func (p *PoolDevice) CreateThinDevice(ctx context.Context, deviceName string, vi if delErr != nil { // Failed to rollback, mark the device as faulty and keep metadata in order to // preserve the faulty device ID - retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, info)) + retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, info.Name)) return } @@ -147,7 +147,7 @@ func (p *PoolDevice) CreateThinDevice(ctx context.Context, deviceName string, vi // We're unable to create the devmapper device, most likely something wrong with the deviceID if devErr != nil { - retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, info)) + retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, info.Name)) return } }() @@ -223,7 +223,7 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string if delErr != nil { // Failed to rollback, mark the device as faulty and keep metadata in order to // preserve the faulty device ID - retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, snapInfo)) + retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, snapInfo.Name)) return } @@ -238,7 +238,7 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string // We're unable to create the devmapper device, most likely something wrong with the deviceID if devErr != nil { - retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, snapInfo)) + retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, snapInfo.Name)) return } }() From 3741fd859116ae519059b43d674666945c84a638 Mon Sep 17 00:00:00 2001 From: Maksym Pavlenko Date: Wed, 31 Jul 2019 11:28:33 -0700 Subject: [PATCH 3/3] Remove deferred flag when removing devmapper device Signed-off-by: Maksym Pavlenko --- snapshots/devmapper/pool_device.go | 2 +- snapshots/testsuite/testsuite.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshots/devmapper/pool_device.go b/snapshots/devmapper/pool_device.go index 68e07af87..22e78c138 100644 --- a/snapshots/devmapper/pool_device.go +++ b/snapshots/devmapper/pool_device.go @@ -364,7 +364,7 @@ func (p *PoolDevice) RemoveDevice(ctx context.Context, deviceName string) error return errors.Wrapf(err, "can't query metadata for device %q", deviceName) } - if err := p.DeactivateDevice(ctx, deviceName, true, true); err != nil { + if err := p.DeactivateDevice(ctx, deviceName, false, true); err != nil { return err } diff --git a/snapshots/testsuite/testsuite.go b/snapshots/testsuite/testsuite.go index 50e0232ed..ee64e30a5 100644 --- a/snapshots/testsuite/testsuite.go +++ b/snapshots/testsuite/testsuite.go @@ -501,7 +501,6 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot if err != nil { t.Fatal(err) } - defer testutil.Unmount(t, base) committedBase := filepath.Join(work, "committed-base") if err = snapshotter.Commit(ctx, committedBase, base, opt); err != nil { @@ -540,6 +539,7 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot if err != nil { t.Fatal(err) } + testutil.Unmount(t, base) err = snapshotter.Remove(ctx, committedBase) if err != nil { t.Fatal(err)