Merge pull request #3470 from mxpv/devmapper_err

Better error recovery in device mapper
This commit is contained in:
Phil Estes 2019-08-05 13:10:06 -04:00 committed by GitHub
commit cb46663725
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 160 additions and 44 deletions

View File

@ -56,6 +56,8 @@ const (
Removing
// Removed means that device successfully removed but not yet deleted from meta store
Removed
// Faulty means that the device is in an error state and the snapshotter failed to roll it back
Faulty
)
func (s DeviceState) String() string {
@ -84,6 +86,8 @@ func (s DeviceState) String() string {
return "Removing"
case Removed:
return "Removed"
case Faulty:
return "Faulty"
default:
return fmt.Sprintf("unknown %d", s)
}

View File

@ -38,6 +38,7 @@ type deviceIDState byte
const (
deviceFree deviceIDState = iota
deviceTaken
deviceFaulty
)
// Bucket names
@ -92,11 +93,14 @@ func (m *PoolMetadata) ensureDatabaseInitialized() error {
// AddDevice saves device info to database.
func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
return m.db.Update(func(tx *bolt.Tx) error {
err := m.db.Update(func(tx *bolt.Tx) error {
devicesBucket := tx.Bucket(devicesBucketName)
// Make sure device name is unique
if err := getObject(devicesBucket, info.Name, nil); err == nil {
// Make sure device name is unique. If there is already a device with the same name,
// but in Faulty state, give it a try with another devmapper device ID.
// See https://github.com/containerd/containerd/pull/3436 for more context.
var existing DeviceInfo
if err := getObject(devicesBucket, info.Name, &existing); err == nil && existing.State != Faulty {
return ErrAlreadyExists
}
@ -108,7 +112,38 @@ func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
info.DeviceID = deviceID
return putObject(devicesBucket, info.Name, info, false)
return putObject(devicesBucket, info.Name, info, true)
})
if err != nil {
return errors.Wrapf(err, "failed to save metadata for device %q (parent: %q)", info.Name, info.ParentName)
}
return nil
}
// MarkFaulty flags the device called name, together with its devmapper device ID, as faulty.
//
// Within a single database transaction it loads the stored DeviceInfo, sets its state to
// Faulty, writes it back (overwriting the existing record), and marks the device ID as
// deviceFaulty. Any error from the lookup, the write, or the ID update is returned as is.
// The ctx argument is currently unused.
//
// The snapshotter might attempt to recreate a device in 'Faulty' state with another devmapper ID
// in subsequent calls, and in case of success its status will be changed to 'Created' or
// 'Activated'. The devmapper device ID will remain in 'deviceFaulty' state until manually
// handled by a user.
func (m *PoolMetadata) MarkFaulty(ctx context.Context, name string) error {
	markFaulty := func(tx *bolt.Tx) error {
		bucket := tx.Bucket(devicesBucketName)

		// Load the current record so only the state changes.
		var info DeviceInfo
		if err := getObject(bucket, name, &info); err != nil {
			return err
		}

		info.State = Faulty
		if err := putObject(bucket, name, &info, true); err != nil {
			return err
		}

		// Flag the devmapper ID too, in the same transaction as the record update.
		return markDeviceID(tx, info.DeviceID, deviceFaulty)
	}

	return m.db.Update(markFaulty)
}

View File

@ -23,8 +23,11 @@ import (
"io/ioutil"
"os"
"path/filepath"
"strconv"
"testing"
"github.com/pkg/errors"
"go.etcd.io/bbolt"
"gotest.tools/assert"
is "gotest.tools/assert/cmp"
)
@ -77,7 +80,7 @@ func TestPoolMetadata_AddDeviceDuplicate(t *testing.T) {
assert.NilError(t, err)
err = store.AddDevice(testCtx, &DeviceInfo{Name: "test"})
assert.Equal(t, ErrAlreadyExists, err)
assert.Equal(t, ErrAlreadyExists, errors.Cause(err))
}
func TestPoolMetadata_ReuseDeviceID(t *testing.T) {
@ -151,6 +154,33 @@ func TestPoolMetadata_UpdateDevice(t *testing.T) {
assert.Equal(t, Created, newInfo.State)
}
// TestPoolMetadata_MarkFaulty verifies that MarkFaulty switches the stored device record to the
// Faulty state and flags the corresponding devmapper device ID as deviceFaulty.
func TestPoolMetadata_MarkFaulty(t *testing.T) {
	tempDir, store := createStore(t)
	defer cleanupStore(t, tempDir, store)

	info := &DeviceInfo{Name: "test"}
	err := store.AddDevice(testCtx, info)
	assert.NilError(t, err)

	err = store.MarkFaulty(testCtx, "test")
	assert.NilError(t, err)

	saved, err := store.GetDevice(testCtx, info.Name)
	assert.NilError(t, err)
	assert.Equal(t, saved.State, Faulty)
	assert.Assert(t, saved.DeviceID > 0)

	// Make sure the device ID is marked as faulty as well
	err = store.db.View(func(tx *bbolt.Tx) error {
		bucket := tx.Bucket(deviceIDBucketName)
		assert.Assert(t, bucket != nil, "device ID bucket is missing")

		key := strconv.FormatUint(uint64(saved.DeviceID), 10)
		value := bucket.Get([]byte(key))
		// Get yields nil for a missing key; fail with a readable message instead of
		// panicking on value[0].
		assert.Assert(t, len(value) > 0, "no state stored for device ID %d", saved.DeviceID)
		assert.Equal(t, value[0], byte(deviceFaulty))
		return nil
	})
	assert.NilError(t, err)
}
func TestPoolMetadata_GetDeviceNames(t *testing.T) {
tempDir, store := createStore(t)
defer cleanupStore(t, tempDir, store)

View File

@ -118,35 +118,59 @@ func (p *PoolDevice) CreateThinDevice(ctx context.Context, deviceName string, vi
State: Unknown,
}
// Save initial device metadata and allocate new device ID from store
if err := p.metadata.AddDevice(ctx, info); err != nil {
return errors.Wrapf(err, "failed to save initial metadata for new thin device %q", deviceName)
}
var (
metaErr error
devErr error
activeErr error
)
defer func() {
if retErr == nil {
// We've created a devmapper device, but failed to activate it; try to roll back everything
if activeErr != nil {
// Delete the device first.
delErr := p.deleteDevice(ctx, info)
if delErr != nil {
// Failed to rollback, mark the device as faulty and keep metadata in order to
// preserve the faulty device ID
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, info.Name))
return
}
// The devmapper device has been successfully deleted, deallocate device ID
if err := p.RemoveDevice(ctx, info.Name); err != nil {
retErr = multierror.Append(retErr, err)
return
}
return
}
// Rollback metadata
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, info.Name))
// We're unable to create the devmapper device, most likely something wrong with the deviceID
if devErr != nil {
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, info.Name))
return
}
}()
// Save initial device metadata and allocate new device ID from store
metaErr = p.metadata.AddDevice(ctx, info)
if metaErr != nil {
return metaErr
}
// Create thin device
if err := p.createDevice(ctx, info); err != nil {
return err
devErr = p.createDevice(ctx, info)
if devErr != nil {
return devErr
}
defer func() {
if retErr == nil {
return
}
// Activate thin device
activeErr = p.activateDevice(ctx, info)
if activeErr != nil {
return activeErr
}
// Rollback creation
retErr = multierror.Append(retErr, p.deleteDevice(ctx, info))
}()
return p.activateDevice(ctx, info)
return nil
}
// createDevice creates thin device
@ -185,36 +209,59 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string
State: Unknown,
}
// Save snapshot metadata and allocate new device ID
if err := p.metadata.AddDevice(ctx, snapInfo); err != nil {
return errors.Wrapf(err, "failed to save initial metadata for snapshot %q", snapshotName)
}
var (
metaErr error
devErr error
activeErr error
)
defer func() {
if retErr == nil {
// We've created a devmapper device, but failed to activate it; try to roll back everything
if activeErr != nil {
// Delete the device first.
delErr := p.deleteDevice(ctx, snapInfo)
if delErr != nil {
// Failed to rollback, mark the device as faulty and keep metadata in order to
// preserve the faulty device ID
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, snapInfo.Name))
return
}
// The devmapper device has been successfully deleted, deallocate device ID
if err := p.RemoveDevice(ctx, snapInfo.Name); err != nil {
retErr = multierror.Append(retErr, err)
return
}
return
}
// Rollback metadata
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, snapInfo.Name))
// We're unable to create the devmapper device, most likely something wrong with the deviceID
if devErr != nil {
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, snapInfo.Name))
return
}
}()
// Save snapshot metadata and allocate new device ID
metaErr = p.metadata.AddDevice(ctx, snapInfo)
if metaErr != nil {
return metaErr
}
// Create thin device snapshot
if err := p.createSnapshot(ctx, baseInfo, snapInfo); err != nil {
return err
devErr = p.createSnapshot(ctx, baseInfo, snapInfo)
if devErr != nil {
return devErr
}
defer func() {
if retErr == nil {
return
}
// Activate the snapshot device
activeErr = p.activateDevice(ctx, snapInfo)
if activeErr != nil {
return activeErr
}
// Rollback snapshot creation
retErr = multierror.Append(retErr, p.deleteDevice(ctx, snapInfo))
}()
// Activate snapshot device
return p.activateDevice(ctx, snapInfo)
return nil
}
func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error {
@ -317,7 +364,7 @@ func (p *PoolDevice) RemoveDevice(ctx context.Context, deviceName string) error
return errors.Wrapf(err, "can't query metadata for device %q", deviceName)
}
if err := p.DeactivateDevice(ctx, deviceName, true, true); err != nil {
if err := p.DeactivateDevice(ctx, deviceName, false, true); err != nil {
return err
}

View File

@ -501,7 +501,6 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot
if err != nil {
t.Fatal(err)
}
defer testutil.Unmount(t, base)
committedBase := filepath.Join(work, "committed-base")
if err = snapshotter.Commit(ctx, committedBase, base, opt); err != nil {
@ -540,6 +539,7 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot
if err != nil {
t.Fatal(err)
}
testutil.Unmount(t, base)
err = snapshotter.Remove(ctx, committedBase)
if err != nil {
t.Fatal(err)