Merge pull request #3470 from mxpv/devmapper_err
Better error recovery in device mapper
This commit is contained in:
commit
cb46663725
@ -56,6 +56,8 @@ const (
|
|||||||
Removing
|
Removing
|
||||||
// Removed means that device successfully removed but not yet deleted from meta store
|
// Removed means that device successfully removed but not yet deleted from meta store
|
||||||
Removed
|
Removed
|
||||||
|
// Faulty means that the device is in an error state and the snapshotter failed to roll it back
|
||||||
|
Faulty
|
||||||
)
|
)
|
||||||
|
|
||||||
func (s DeviceState) String() string {
|
func (s DeviceState) String() string {
|
||||||
@ -84,6 +86,8 @@ func (s DeviceState) String() string {
|
|||||||
return "Removing"
|
return "Removing"
|
||||||
case Removed:
|
case Removed:
|
||||||
return "Removed"
|
return "Removed"
|
||||||
|
case Faulty:
|
||||||
|
return "Faulty"
|
||||||
default:
|
default:
|
||||||
return fmt.Sprintf("unknown %d", s)
|
return fmt.Sprintf("unknown %d", s)
|
||||||
}
|
}
|
||||||
|
@ -38,6 +38,7 @@ type deviceIDState byte
|
|||||||
const (
|
const (
|
||||||
deviceFree deviceIDState = iota
|
deviceFree deviceIDState = iota
|
||||||
deviceTaken
|
deviceTaken
|
||||||
|
deviceFaulty
|
||||||
)
|
)
|
||||||
|
|
||||||
// Bucket names
|
// Bucket names
|
||||||
@ -92,11 +93,14 @@ func (m *PoolMetadata) ensureDatabaseInitialized() error {
|
|||||||
|
|
||||||
// AddDevice saves device info to database.
|
// AddDevice saves device info to database.
|
||||||
func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
|
func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
|
||||||
return m.db.Update(func(tx *bolt.Tx) error {
|
err := m.db.Update(func(tx *bolt.Tx) error {
|
||||||
devicesBucket := tx.Bucket(devicesBucketName)
|
devicesBucket := tx.Bucket(devicesBucketName)
|
||||||
|
|
||||||
// Make sure device name is unique
|
// Make sure device name is unique. If there is already a device with the same name,
|
||||||
if err := getObject(devicesBucket, info.Name, nil); err == nil {
|
// but in Faulty state, give it a try with another devmapper device ID.
|
||||||
|
// See https://github.com/containerd/containerd/pull/3436 for more context.
|
||||||
|
var existing DeviceInfo
|
||||||
|
if err := getObject(devicesBucket, info.Name, &existing); err == nil && existing.State != Faulty {
|
||||||
return ErrAlreadyExists
|
return ErrAlreadyExists
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -108,7 +112,38 @@ func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
|
|||||||
|
|
||||||
info.DeviceID = deviceID
|
info.DeviceID = deviceID
|
||||||
|
|
||||||
return putObject(devicesBucket, info.Name, info, false)
|
return putObject(devicesBucket, info.Name, info, true)
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return errors.Wrapf(err, "failed to save metadata for device %q (parent: %q)", info.Name, info.ParentName)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// MarkFaulty marks the given device and corresponding devmapper device ID as faulty.
|
||||||
|
// The snapshotter might attempt to recreate a device in 'Faulty' state with another devmapper ID in
|
||||||
|
// subsequent calls, and in case of success its status will be changed to 'Created' or 'Activated'.
|
||||||
|
// The devmapper dev ID will remain in 'deviceFaulty' state until manually handled by a user.
|
||||||
|
func (m *PoolMetadata) MarkFaulty(ctx context.Context, name string) error {
|
||||||
|
return m.db.Update(func(tx *bolt.Tx) error {
|
||||||
|
var (
|
||||||
|
device = DeviceInfo{}
|
||||||
|
devBucket = tx.Bucket(devicesBucketName)
|
||||||
|
)
|
||||||
|
|
||||||
|
if err := getObject(devBucket, name, &device); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
device.State = Faulty
|
||||||
|
|
||||||
|
if err := putObject(devBucket, name, &device, true); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return markDeviceID(tx, device.DeviceID, deviceFaulty)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,8 +23,11 @@ import (
|
|||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/pkg/errors"
|
||||||
|
"go.etcd.io/bbolt"
|
||||||
"gotest.tools/assert"
|
"gotest.tools/assert"
|
||||||
is "gotest.tools/assert/cmp"
|
is "gotest.tools/assert/cmp"
|
||||||
)
|
)
|
||||||
@ -77,7 +80,7 @@ func TestPoolMetadata_AddDeviceDuplicate(t *testing.T) {
|
|||||||
assert.NilError(t, err)
|
assert.NilError(t, err)
|
||||||
|
|
||||||
err = store.AddDevice(testCtx, &DeviceInfo{Name: "test"})
|
err = store.AddDevice(testCtx, &DeviceInfo{Name: "test"})
|
||||||
assert.Equal(t, ErrAlreadyExists, err)
|
assert.Equal(t, ErrAlreadyExists, errors.Cause(err))
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestPoolMetadata_ReuseDeviceID(t *testing.T) {
|
func TestPoolMetadata_ReuseDeviceID(t *testing.T) {
|
||||||
@ -151,6 +154,33 @@ func TestPoolMetadata_UpdateDevice(t *testing.T) {
|
|||||||
assert.Equal(t, Created, newInfo.State)
|
assert.Equal(t, Created, newInfo.State)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestPoolMetadata_MarkFaulty(t *testing.T) {
|
||||||
|
tempDir, store := createStore(t)
|
||||||
|
defer cleanupStore(t, tempDir, store)
|
||||||
|
|
||||||
|
info := &DeviceInfo{Name: "test"}
|
||||||
|
err := store.AddDevice(testCtx, info)
|
||||||
|
assert.NilError(t, err)
|
||||||
|
|
||||||
|
err = store.MarkFaulty(testCtx, "test")
|
||||||
|
assert.NilError(t, err)
|
||||||
|
|
||||||
|
saved, err := store.GetDevice(testCtx, info.Name)
|
||||||
|
assert.NilError(t, err)
|
||||||
|
assert.Equal(t, saved.State, Faulty)
|
||||||
|
assert.Assert(t, saved.DeviceID > 0)
|
||||||
|
|
||||||
|
// Make sure the device ID is marked as faulty as well
|
||||||
|
err = store.db.View(func(tx *bbolt.Tx) error {
|
||||||
|
bucket := tx.Bucket(deviceIDBucketName)
|
||||||
|
key := strconv.FormatUint(uint64(saved.DeviceID), 10)
|
||||||
|
value := bucket.Get([]byte(key))
|
||||||
|
assert.Equal(t, value[0], byte(deviceFaulty))
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
assert.NilError(t, err)
|
||||||
|
}
|
||||||
|
|
||||||
func TestPoolMetadata_GetDeviceNames(t *testing.T) {
|
func TestPoolMetadata_GetDeviceNames(t *testing.T) {
|
||||||
tempDir, store := createStore(t)
|
tempDir, store := createStore(t)
|
||||||
defer cleanupStore(t, tempDir, store)
|
defer cleanupStore(t, tempDir, store)
|
||||||
|
@ -118,35 +118,59 @@ func (p *PoolDevice) CreateThinDevice(ctx context.Context, deviceName string, vi
|
|||||||
State: Unknown,
|
State: Unknown,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save initial device metadata and allocate new device ID from store
|
var (
|
||||||
if err := p.metadata.AddDevice(ctx, info); err != nil {
|
metaErr error
|
||||||
return errors.Wrapf(err, "failed to save initial metadata for new thin device %q", deviceName)
|
devErr error
|
||||||
}
|
activeErr error
|
||||||
|
)
|
||||||
|
|
||||||
defer func() {
|
defer func() {
|
||||||
if retErr == nil {
|
// We've created a devmapper device but failed to activate it; try to roll everything back
|
||||||
|
if activeErr != nil {
|
||||||
|
// Delete the device first.
|
||||||
|
delErr := p.deleteDevice(ctx, info)
|
||||||
|
if delErr != nil {
|
||||||
|
// Failed to rollback, mark the device as faulty and keep metadata in order to
|
||||||
|
// preserve the faulty device ID
|
||||||
|
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, info.Name))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// The devmapper device has been successfully deleted, deallocate device ID
|
||||||
|
if err := p.RemoveDevice(ctx, info.Name); err != nil {
|
||||||
|
retErr = multierror.Append(retErr, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rollback metadata
|
// We're unable to create the devmapper device; most likely something is wrong with the device ID
|
||||||
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, info.Name))
|
if devErr != nil {
|
||||||
|
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, info.Name))
|
||||||
|
return
|
||||||
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// Save initial device metadata and allocate new device ID from store
|
||||||
|
metaErr = p.metadata.AddDevice(ctx, info)
|
||||||
|
if metaErr != nil {
|
||||||
|
return metaErr
|
||||||
|
}
|
||||||
|
|
||||||
// Create thin device
|
// Create thin device
|
||||||
if err := p.createDevice(ctx, info); err != nil {
|
devErr = p.createDevice(ctx, info)
|
||||||
return err
|
if devErr != nil {
|
||||||
|
return devErr
|
||||||
}
|
}
|
||||||
|
|
||||||
defer func() {
|
// Activate thin device
|
||||||
if retErr == nil {
|
activeErr = p.activateDevice(ctx, info)
|
||||||
return
|
if activeErr != nil {
|
||||||
}
|
return activeErr
|
||||||
|
}
|
||||||
|
|
||||||
// Rollback creation
|
return nil
|
||||||
retErr = multierror.Append(retErr, p.deleteDevice(ctx, info))
|
|
||||||
}()
|
|
||||||
|
|
||||||
return p.activateDevice(ctx, info)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// createDevice creates thin device
|
// createDevice creates thin device
|
||||||
@ -185,36 +209,59 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string
|
|||||||
State: Unknown,
|
State: Unknown,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save snapshot metadata and allocate new device ID
|
var (
|
||||||
if err := p.metadata.AddDevice(ctx, snapInfo); err != nil {
|
metaErr error
|
||||||
return errors.Wrapf(err, "failed to save initial metadata for snapshot %q", snapshotName)
|
devErr error
|
||||||
}
|
activeErr error
|
||||||
|
)
|
||||||
|
|
||||||
defer func() {
|
defer func() {
|
||||||
if retErr == nil {
|
// We've created a devmapper device but failed to activate it; try to roll everything back
|
||||||
|
if activeErr != nil {
|
||||||
|
// Delete the device first.
|
||||||
|
delErr := p.deleteDevice(ctx, snapInfo)
|
||||||
|
if delErr != nil {
|
||||||
|
// Failed to rollback, mark the device as faulty and keep metadata in order to
|
||||||
|
// preserve the faulty device ID
|
||||||
|
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, snapInfo.Name))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// The devmapper device has been successfully deleted, deallocate device ID
|
||||||
|
if err := p.RemoveDevice(ctx, snapInfo.Name); err != nil {
|
||||||
|
retErr = multierror.Append(retErr, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rollback metadata
|
// We're unable to create the devmapper device; most likely something is wrong with the device ID
|
||||||
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, snapInfo.Name))
|
if devErr != nil {
|
||||||
|
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, snapInfo.Name))
|
||||||
|
return
|
||||||
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// Save snapshot metadata and allocate new device ID
|
||||||
|
metaErr = p.metadata.AddDevice(ctx, snapInfo)
|
||||||
|
if metaErr != nil {
|
||||||
|
return metaErr
|
||||||
|
}
|
||||||
|
|
||||||
// Create thin device snapshot
|
// Create thin device snapshot
|
||||||
if err := p.createSnapshot(ctx, baseInfo, snapInfo); err != nil {
|
devErr = p.createSnapshot(ctx, baseInfo, snapInfo)
|
||||||
return err
|
if devErr != nil {
|
||||||
|
return devErr
|
||||||
}
|
}
|
||||||
|
|
||||||
defer func() {
|
// Activate the snapshot device
|
||||||
if retErr == nil {
|
activeErr = p.activateDevice(ctx, snapInfo)
|
||||||
return
|
if activeErr != nil {
|
||||||
}
|
return activeErr
|
||||||
|
}
|
||||||
|
|
||||||
// Rollback snapshot creation
|
return nil
|
||||||
retErr = multierror.Append(retErr, p.deleteDevice(ctx, snapInfo))
|
|
||||||
}()
|
|
||||||
|
|
||||||
// Activate snapshot device
|
|
||||||
return p.activateDevice(ctx, snapInfo)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error {
|
func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error {
|
||||||
@ -317,7 +364,7 @@ func (p *PoolDevice) RemoveDevice(ctx context.Context, deviceName string) error
|
|||||||
return errors.Wrapf(err, "can't query metadata for device %q", deviceName)
|
return errors.Wrapf(err, "can't query metadata for device %q", deviceName)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := p.DeactivateDevice(ctx, deviceName, true, true); err != nil {
|
if err := p.DeactivateDevice(ctx, deviceName, false, true); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -501,7 +501,6 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
defer testutil.Unmount(t, base)
|
|
||||||
|
|
||||||
committedBase := filepath.Join(work, "committed-base")
|
committedBase := filepath.Join(work, "committed-base")
|
||||||
if err = snapshotter.Commit(ctx, committedBase, base, opt); err != nil {
|
if err = snapshotter.Commit(ctx, committedBase, base, opt); err != nil {
|
||||||
@ -540,6 +539,7 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
testutil.Unmount(t, base)
|
||||||
err = snapshotter.Remove(ctx, committedBase)
|
err = snapshotter.Remove(ctx, committedBase)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
|
Loading…
Reference in New Issue
Block a user