Merge pull request #3470 from mxpv/devmapper_err

Better error recovery in device mapper
This commit is contained in:
Phil Estes 2019-08-05 13:10:06 -04:00 committed by GitHub
commit cb46663725
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 160 additions and 44 deletions

View File

@ -56,6 +56,8 @@ const (
Removing Removing
// Removed means that device successfully removed but not yet deleted from meta store // Removed means that device successfully removed but not yet deleted from meta store
Removed Removed
// Faulty means that the device is in an error state and the snapshotter failed to roll it back
Faulty
) )
func (s DeviceState) String() string { func (s DeviceState) String() string {
@ -84,6 +86,8 @@ func (s DeviceState) String() string {
return "Removing" return "Removing"
case Removed: case Removed:
return "Removed" return "Removed"
case Faulty:
return "Faulty"
default: default:
return fmt.Sprintf("unknown %d", s) return fmt.Sprintf("unknown %d", s)
} }

View File

@ -38,6 +38,7 @@ type deviceIDState byte
const ( const (
deviceFree deviceIDState = iota deviceFree deviceIDState = iota
deviceTaken deviceTaken
deviceFaulty
) )
// Bucket names // Bucket names
@ -92,11 +93,14 @@ func (m *PoolMetadata) ensureDatabaseInitialized() error {
// AddDevice saves device info to database. // AddDevice saves device info to database.
func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error { func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
return m.db.Update(func(tx *bolt.Tx) error { err := m.db.Update(func(tx *bolt.Tx) error {
devicesBucket := tx.Bucket(devicesBucketName) devicesBucket := tx.Bucket(devicesBucketName)
// Make sure device name is unique // Make sure device name is unique. If there is already a device with the same name,
if err := getObject(devicesBucket, info.Name, nil); err == nil { // but in Faulty state, give it a try with another devmapper device ID.
// See https://github.com/containerd/containerd/pull/3436 for more context.
var existing DeviceInfo
if err := getObject(devicesBucket, info.Name, &existing); err == nil && existing.State != Faulty {
return ErrAlreadyExists return ErrAlreadyExists
} }
@ -108,7 +112,38 @@ func (m *PoolMetadata) AddDevice(ctx context.Context, info *DeviceInfo) error {
info.DeviceID = deviceID info.DeviceID = deviceID
return putObject(devicesBucket, info.Name, info, false) return putObject(devicesBucket, info.Name, info, true)
})
if err != nil {
return errors.Wrapf(err, "failed to save metadata for device %q (parent: %q)", info.Name, info.ParentName)
}
return nil
}
// MarkFaulty marks the given device and corresponding devmapper device ID as faulty.
// The snapshotter might attempt to recreate a device in 'Faulty' state with another devmapper ID in
// subsequent calls, and in case of success its status will be changed to 'Created' or 'Activated'.
// The devmapper dev ID will remain in 'deviceFaulty' state until manually handled by a user.
func (m *PoolMetadata) MarkFaulty(ctx context.Context, name string) error {
return m.db.Update(func(tx *bolt.Tx) error {
var (
device = DeviceInfo{}
devBucket = tx.Bucket(devicesBucketName)
)
if err := getObject(devBucket, name, &device); err != nil {
return err
}
device.State = Faulty
if err := putObject(devBucket, name, &device, true); err != nil {
return err
}
return markDeviceID(tx, device.DeviceID, deviceFaulty)
}) })
} }

View File

@ -23,8 +23,11 @@ import (
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"testing" "testing"
"github.com/pkg/errors"
"go.etcd.io/bbolt"
"gotest.tools/assert" "gotest.tools/assert"
is "gotest.tools/assert/cmp" is "gotest.tools/assert/cmp"
) )
@ -77,7 +80,7 @@ func TestPoolMetadata_AddDeviceDuplicate(t *testing.T) {
assert.NilError(t, err) assert.NilError(t, err)
err = store.AddDevice(testCtx, &DeviceInfo{Name: "test"}) err = store.AddDevice(testCtx, &DeviceInfo{Name: "test"})
assert.Equal(t, ErrAlreadyExists, err) assert.Equal(t, ErrAlreadyExists, errors.Cause(err))
} }
func TestPoolMetadata_ReuseDeviceID(t *testing.T) { func TestPoolMetadata_ReuseDeviceID(t *testing.T) {
@ -151,6 +154,33 @@ func TestPoolMetadata_UpdateDevice(t *testing.T) {
assert.Equal(t, Created, newInfo.State) assert.Equal(t, Created, newInfo.State)
} }
// TestPoolMetadata_MarkFaulty verifies that MarkFaulty moves a stored device into the
// Faulty state and also records its devmapper device ID as deviceFaulty in the
// device ID bucket.
func TestPoolMetadata_MarkFaulty(t *testing.T) {
tempDir, store := createStore(t)
defer cleanupStore(t, tempDir, store)
// Register a device first so there is a metadata record to mark as faulty.
info := &DeviceInfo{Name: "test"}
err := store.AddDevice(testCtx, info)
assert.NilError(t, err)
err = store.MarkFaulty(testCtx, "test")
assert.NilError(t, err)
// Read the record back and check both the new state and that a device ID is set.
saved, err := store.GetDevice(testCtx, info.Name)
assert.NilError(t, err)
assert.Equal(t, saved.State, Faulty)
assert.Assert(t, saved.DeviceID > 0)
// Make sure the device ID is marked as faulty as well
err = store.db.View(func(tx *bbolt.Tx) error {
bucket := tx.Bucket(deviceIDBucketName)
// The lookup key is the base-10 string form of the device ID.
key := strconv.FormatUint(uint64(saved.DeviceID), 10)
value := bucket.Get([]byte(key))
// NOTE(review): value is indexed without a length check; presumably Get returns nil
// for a missing key, in which case this would panic instead of failing the assertion.
assert.Equal(t, value[0], byte(deviceFaulty))
return nil
})
assert.NilError(t, err)
}
func TestPoolMetadata_GetDeviceNames(t *testing.T) { func TestPoolMetadata_GetDeviceNames(t *testing.T) {
tempDir, store := createStore(t) tempDir, store := createStore(t)
defer cleanupStore(t, tempDir, store) defer cleanupStore(t, tempDir, store)

View File

@ -118,35 +118,59 @@ func (p *PoolDevice) CreateThinDevice(ctx context.Context, deviceName string, vi
State: Unknown, State: Unknown,
} }
// Save initial device metadata and allocate new device ID from store var (
if err := p.metadata.AddDevice(ctx, info); err != nil { metaErr error
return errors.Wrapf(err, "failed to save initial metadata for new thin device %q", deviceName) devErr error
} activeErr error
)
defer func() { defer func() {
if retErr == nil { // We've created a devmapper device, but failed to activate it, try to roll back everything
if activeErr != nil {
// Delete the device first.
delErr := p.deleteDevice(ctx, info)
if delErr != nil {
// Failed to roll back, mark the device as faulty and keep metadata in order to
// preserve the faulty device ID
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, info.Name))
return
}
// The devmapper device has been successfully deleted, deallocate device ID
if err := p.RemoveDevice(ctx, info.Name); err != nil {
retErr = multierror.Append(retErr, err)
return
}
return return
} }
// Rollback metadata // We're unable to create the devmapper device, most likely something wrong with the deviceID
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, info.Name)) if devErr != nil {
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, info.Name))
return
}
}() }()
// Save initial device metadata and allocate new device ID from store
metaErr = p.metadata.AddDevice(ctx, info)
if metaErr != nil {
return metaErr
}
// Create thin device // Create thin device
if err := p.createDevice(ctx, info); err != nil { devErr = p.createDevice(ctx, info)
return err if devErr != nil {
return devErr
} }
defer func() { // Activate thin device
if retErr == nil { activeErr = p.activateDevice(ctx, info)
return if activeErr != nil {
} return activeErr
}
// Rollback creation return nil
retErr = multierror.Append(retErr, p.deleteDevice(ctx, info))
}()
return p.activateDevice(ctx, info)
} }
// createDevice creates thin device // createDevice creates thin device
@ -185,36 +209,59 @@ func (p *PoolDevice) CreateSnapshotDevice(ctx context.Context, deviceName string
State: Unknown, State: Unknown,
} }
// Save snapshot metadata and allocate new device ID var (
if err := p.metadata.AddDevice(ctx, snapInfo); err != nil { metaErr error
return errors.Wrapf(err, "failed to save initial metadata for snapshot %q", snapshotName) devErr error
} activeErr error
)
defer func() { defer func() {
if retErr == nil { // We've created a devmapper device, but failed to activate it, try to roll back everything
if activeErr != nil {
// Delete the device first.
delErr := p.deleteDevice(ctx, snapInfo)
if delErr != nil {
// Failed to roll back, mark the device as faulty and keep metadata in order to
// preserve the faulty device ID
retErr = multierror.Append(retErr, delErr, p.metadata.MarkFaulty(ctx, snapInfo.Name))
return
}
// The devmapper device has been successfully deleted, deallocate device ID
if err := p.RemoveDevice(ctx, snapInfo.Name); err != nil {
retErr = multierror.Append(retErr, err)
return
}
return return
} }
// Rollback metadata // We're unable to create the devmapper device, most likely something wrong with the deviceID
retErr = multierror.Append(retErr, p.metadata.RemoveDevice(ctx, snapInfo.Name)) if devErr != nil {
retErr = multierror.Append(retErr, p.metadata.MarkFaulty(ctx, snapInfo.Name))
return
}
}() }()
// Save snapshot metadata and allocate new device ID
metaErr = p.metadata.AddDevice(ctx, snapInfo)
if metaErr != nil {
return metaErr
}
// Create thin device snapshot // Create thin device snapshot
if err := p.createSnapshot(ctx, baseInfo, snapInfo); err != nil { devErr = p.createSnapshot(ctx, baseInfo, snapInfo)
return err if devErr != nil {
return devErr
} }
defer func() { // Activate the snapshot device
if retErr == nil { activeErr = p.activateDevice(ctx, snapInfo)
return if activeErr != nil {
} return activeErr
}
// Rollback snapshot creation return nil
retErr = multierror.Append(retErr, p.deleteDevice(ctx, snapInfo))
}()
// Activate snapshot device
return p.activateDevice(ctx, snapInfo)
} }
func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error { func (p *PoolDevice) createSnapshot(ctx context.Context, baseInfo, snapInfo *DeviceInfo) error {
@ -317,7 +364,7 @@ func (p *PoolDevice) RemoveDevice(ctx context.Context, deviceName string) error
return errors.Wrapf(err, "can't query metadata for device %q", deviceName) return errors.Wrapf(err, "can't query metadata for device %q", deviceName)
} }
if err := p.DeactivateDevice(ctx, deviceName, true, true); err != nil { if err := p.DeactivateDevice(ctx, deviceName, false, true); err != nil {
return err return err
} }

View File

@ -501,7 +501,6 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
defer testutil.Unmount(t, base)
committedBase := filepath.Join(work, "committed-base") committedBase := filepath.Join(work, "committed-base")
if err = snapshotter.Commit(ctx, committedBase, base, opt); err != nil { if err = snapshotter.Commit(ctx, committedBase, base, opt); err != nil {
@ -540,6 +539,7 @@ func checkRemoveIntermediateSnapshot(ctx context.Context, t *testing.T, snapshot
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
testutil.Unmount(t, base)
err = snapshotter.Remove(ctx, committedBase) err = snapshotter.Remove(ctx, committedBase)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)