Merge pull request #3470 from mxpv/devmapper_err
Better error recovery in device mapper
This commit is contained in:
commit
cb46663725
@ -56,6 +56,8 @@ const (
|
||||
Removing
|
||||
// Removed means that the device was successfully removed but not yet deleted from the meta store
|
||||
Removed
|
||||
// Faulty means that the device is in an error state and the snapshotter failed to roll it back
|
||||
Faulty
|
||||
)
|
||||
|
||||
func (s DeviceState) String() string {
|
||||
@ -84,6 +86,8 @@ func (s DeviceState) String() string {
|
||||
return "Removing"
|
||||
case Removed:
|
||||
return "Removed"
|
||||
case Faulty:
|
||||
return "Faulty"
|
||||
default:
|
||||
return fmt.Sprintf("unknown %d", s)
|
||||
}
|
||||
|
@ -38,6 +38,7 @@ type deviceIDState byte
|
||||
const (
|
||||
deviceFree deviceIDState = iota
|
||||
deviceTaken
|
||||
deviceFaulty
|
||||
)
|
||||
|
||||
// Bucket names
|
||||
@ -92,11 +93,14 @@ func (m *PoolMetadata) ensureDatabaseInitialized() error {
|
||||
|
||||
// AddDevice saves device info to database.
|
||||
func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
|
||||
return m.db.Update(func(tx *bolt.Tx) error {
|
||||
err := m.db.Update(func(tx *bolt.Tx) error {
|
||||
devicesBucket := tx.Bucket(devicesBucketName)
|
||||
|
||||
// Make sure device name is unique
|
||||
if err := getObject(devicesBucket, info.Name, nil); err == nil {
|
||||
// Make sure device name is unique. If there is already a device with the same name,
|
||||
// but in Faulty state, give it a try with another devmapper device ID.
|
||||
// See https://github.com/containerd/containerd/pull/3436 for more context.
|
||||
var existing DeviceInfo
|
||||
if err := getObject(devicesBucket, info.Name, &existing); err == nil && existing.State != Faulty {
|
||||
return ErrAlreadyExists
|
||||
}
|
||||
|
||||
@ -108,7 +112,38 @@ func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
|
||||
|
||||
info.DeviceID = deviceID
|
||||
|
||||
return putObject(devicesBucket, info.Name, info, false)
|
||||
return putObject(devicesBucket, info.Name, info, true)
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "failed to save metadata for device %q (parent: %q)", info.Name, info.ParentName)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarkFaulty marks the given device and corresponding devmapper device ID as faulty.
|
||||
// The snapshotter might attempt to recreate a device in 'Faulty' state with another devmapper ID in
|
||||
// subsequent calls, and in case of success its status will be changed to 'Created' or 'Activated'.
|
||||
// The devmapper dev ID will remain in 'deviceFaulty' state until manually handled by a user.
|
||||
func (m *PoolMetadata) MarkFaulty(ctx context.Context, name string) error {
|
||||
return m.db.Update(func(tx *bolt.Tx) error {
|
||||
var (
|
||||
device = DeviceInfo{}
|
||||
devBucket = tx.Bucket(devicesBucketName)
|
||||
)
|
||||
|
||||
if err := getObject(devBucket, name, &device); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
device.State = Faulty
|
||||
|
||||
if err := putObject(devBucket, name, &device, true); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return markDeviceID(tx, device.DeviceID, deviceFaulty)
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -23,8 +23,11 @@ import (
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"go.etcd.io/bbolt"
|
||||
"gotest.tools/assert"
|
||||
is "gotest.tools/assert/cmp"
|
||||
)
|
||||
@ -77,7 +80,7 @@ func TestPoolMetadata_AddDeviceDuplicate(t *testing.T) {
|
||||
assert.NilError(t, err)
|
||||
|
||||
err = store.AddDevice(testCtx, &DeviceInfo{Name: "test"})
|
||||
assert.Equal(t, ErrAlreadyExists, err)
|
||||
assert.Equal(t, ErrAlreadyExists, errors.Cause(err))
|
||||
}
|
||||
|
||||
func TestPoolMetadata_ReuseDeviceID(t *testing.T) {
|
||||
@ -151,6 +154,33 @@ func TestPoolMetadata_UpdateDevice(t *testing.T) {
|
||||
assert.Equal(t, Created, newInfo.State)
|
||||
}
|
||||
|
||||
func TestPoolMetadata_MarkFaulty(t *testing.T) {
|
||||
tempDir, store := createStore(t)
|
||||
defer cleanupStore(t, tempDir, store)
|
||||
|
||||
info := &DeviceInfo{Name: "test"}
|
||||
err := store.AddDevice(testCtx, info)
|
||||
assert.NilError(t, err)
|
||||
|
||||
err = store.MarkFaulty(testCtx, "test")
|
||||
assert.NilError(t, err)
|
||||
|
||||
saved, err := store.GetDevice(testCtx, info.Name)
|
||||
assert.NilError(t, err)
|
||||
assert.Equal(t, saved.State, Faulty)
|
||||
assert.Assert(t, saved.DeviceID > 0)
|
||||
|
||||
// Make sure the device ID is marked as faulty as well
|
||||
err = store.db.View(func(tx *bbolt.Tx) error {
|
||||
bucket := tx.Bucket(deviceIDBucketName)
|
||||
key := strconv.FormatUint(uint64(saved.DeviceID), 10)
|
||||
value := bucket.Get([]byte(key))
|
||||
assert.Equal(t, value[0], byte(deviceFaulty))
|
||||
return nil
|
||||
})
|
||||
assert.NilError(t, err)
|
||||
}
|
||||
|
||||
func TestPoolMetadata_GetDeviceNames(t *testing.T) {
|
||||
tempDir, store := createStore(t)
|
||||
defer cleanupStore(t, tempDir, store)
|
||||
|
@ -118,35 +118,59 @@ func (p *PoolDevice) CreateThinDevice(ctx context.Context, deviceName string, vi
|
||||
State: Unknown,
|
||||
}
|
||||
|
||||
// Save initial device metadata and allocate new device ID from store
|
||||
if err := p.metadata.AddDevice(ctx, info); err != nil {
|
||||
return errors.Wrapf(err, "failed to save initial metadata for new thin device %q", deviceName)
|
||||
}
|
||||
var (
|
||||
metaErr error
|
||||
devErr error
|
||||
activeErr error
|
||||
)
|
||||
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
// We've created a devmapper device but failed to activate it; try to roll back everything
|
||||
if activeErr != nil {
|
||||
// Delete the device first.
|
||||
delErr := p.deleteDevice(ctx, info)
|
||||
if delErr != nil {
|
||||
// Failed to roll back, mark the device as faulty and keep metadata in order to
|
||||
// preserve the faulty device ID
|
||||
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, info.Name))
|
||||
return
|
||||
}
|
||||
|
||||
// The devmapper device has been successfully deleted, deallocate device ID
|
||||
if err := p.RemoveDevice(ctx, info.Name); err != nil {
|
||||
retErr = multierror.Append(retErr, err)
|
||||
return
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// Rollback metadata
|
||||
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, info.Name))
|
||||
// We're unable to create the devmapper device; most likely something is wrong with the device ID
|
||||
if devErr != nil {
|
||||
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, info.Name))
|
||||
return
|
||||
}
|
||||
}()
|
||||
|
||||
// Save initial device metadata and allocate new device ID from store
|
||||
metaErr = p.metadata.AddDevice(ctx, info)
|
||||
if metaErr != nil {
|
||||
return metaErr
|
||||
}
|
||||
|
||||
// Create thin device
|
||||
if err := p.createDevice(ctx, info); err != nil {
|
||||
return err
|
||||
devErr = p.createDevice(ctx, info)
|
||||
if devErr != nil {
|
||||
return devErr
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
return
|
||||
}
|
||||
// Activate thin device
|
||||
activeErr = p.activateDevice(ctx, info)
|
||||
if activeErr != nil {
|
||||
return activeErr
|
||||
}
|
||||
|
||||
// Rollback creation
|
||||
retErr = multierror.Append(retErr, p.deleteDevice(ctx, info))
|
||||
}()
|
||||
|
||||
return p.activateDevice(ctx, info)
|
||||
return nil
|
||||
}
|
||||
|
||||
// createDevice creates thin device
|
||||
@ -185,36 +209,59 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string
|
||||
State: Unknown,
|
||||
}
|
||||
|
||||
// Save snapshot metadata and allocate new device ID
|
||||
if err := p.metadata.AddDevice(ctx, snapInfo); err != nil {
|
||||
return errors.Wrapf(err, "failed to save initial metadata for snapshot %q", snapshotName)
|
||||
}
|
||||
var (
|
||||
metaErr error
|
||||
devErr error
|
||||
activeErr error
|
||||
)
|
||||
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
// We've created a devmapper device but failed to activate it; try to roll back everything
|
||||
if activeErr != nil {
|
||||
// Delete the device first.
|
||||
delErr := p.deleteDevice(ctx, snapInfo)
|
||||
if delErr != nil {
|
||||
// Failed to roll back, mark the device as faulty and keep metadata in order to
|
||||
// preserve the faulty device ID
|
||||
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, snapInfo.Name))
|
||||
return
|
||||
}
|
||||
|
||||
// The devmapper device has been successfully deleted, deallocate device ID
|
||||
if err := p.RemoveDevice(ctx, snapInfo.Name); err != nil {
|
||||
retErr = multierror.Append(retErr, err)
|
||||
return
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// Rollback metadata
|
||||
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, snapInfo.Name))
|
||||
// We're unable to create the devmapper device; most likely something is wrong with the device ID
|
||||
if devErr != nil {
|
||||
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, snapInfo.Name))
|
||||
return
|
||||
}
|
||||
}()
|
||||
|
||||
// Save snapshot metadata and allocate new device ID
|
||||
metaErr = p.metadata.AddDevice(ctx, snapInfo)
|
||||
if metaErr != nil {
|
||||
return metaErr
|
||||
}
|
||||
|
||||
// Create thin device snapshot
|
||||
if err := p.createSnapshot(ctx, baseInfo, snapInfo); err != nil {
|
||||
return err
|
||||
devErr = p.createSnapshot(ctx, baseInfo, snapInfo)
|
||||
if devErr != nil {
|
||||
return devErr
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
return
|
||||
}
|
||||
// Activate the snapshot device
|
||||
activeErr = p.activateDevice(ctx, snapInfo)
|
||||
if activeErr != nil {
|
||||
return activeErr
|
||||
}
|
||||
|
||||
// Rollback snapshot creation
|
||||
retErr = multierror.Append(retErr, p.deleteDevice(ctx, snapInfo))
|
||||
}()
|
||||
|
||||
// Activate snapshot device
|
||||
return p.activateDevice(ctx, snapInfo)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error {
|
||||
@ -317,7 +364,7 @@ func (p *PoolDevice) RemoveDevice(ctx context.Context, deviceName string) error
|
||||
return errors.Wrapf(err, "can't query metadata for device %q", deviceName)
|
||||
}
|
||||
|
||||
if err := p.DeactivateDevice(ctx, deviceName, true, true); err != nil {
|
||||
if err := p.DeactivateDevice(ctx, deviceName, false, true); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -501,7 +501,6 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer testutil.Unmount(t, base)
|
||||
|
||||
committedBase := filepath.Join(work, "committed-base")
|
||||
if err = snapshotter.Commit(ctx, committedBase, base, opt); err != nil {
|
||||
@ -540,6 +539,7 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
testutil.Unmount(t, base)
|
||||
err = snapshotter.Remove(ctx, committedBase)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
|
Loading…
Reference in New Issue
Block a user