Make GCE PD attach block on pending detach. Retry on detach/attach errors.

This commit is contained in:
saadali
2015-07-19 19:00:10 -07:00
parent dfe1eb9b96
commit 234f3ed814
5 changed files with 352 additions and 201 deletions

View File

@@ -43,6 +43,7 @@ const (
maxChecks = 10
maxRetries = 10
checkSleepDuration = time.Second
errorSleepDuration = 5 * time.Second
)
// Singleton operation manager for managing detach clean up go routines
@@ -54,24 +55,17 @@ type GCEDiskUtil struct{}
// Mounts the disk to it's global path.
func (diskUtil *GCEDiskUtil) AttachAndMountDisk(pd *gcePersistentDisk, globalPDPath string) error {
glog.V(5).Infof("AttachAndMountDisk(pd, %q) where pd is %#v\r\n", globalPDPath, pd)
// Terminate any in progress verify detach go routines, this will block until the goroutine is ready to exit because the channel is unbuffered
// Block execution until any pending detach goroutines for this pd have completed
detachCleanupManager.Send(pd.pdName, true)
sdBefore, err := filepath.Glob(diskSDPattern)
if err != nil {
glog.Errorf("Error filepath.Glob(\"%s\"): %v\r\n", diskSDPattern, err)
}
sdBeforeSet := util.NewStringSet(sdBefore...)
gce, err := cloudprovider.GetCloudProvider("gce", nil)
if err != nil {
return err
}
if err := gce.(*gce_cloud.GCECloud).AttachDisk(pd.pdName, pd.readOnly); err != nil {
return err
}
devicePath, err := verifyAttached(pd, sdBeforeSet, gce)
devicePath, err := attachDiskAndVerify(pd, sdBeforeSet)
if err != nil {
return err
}
@@ -108,33 +102,51 @@ func (util *GCEDiskUtil) DetachDisk(pd *gcePersistentDisk) error {
globalPDPath := makeGlobalPDName(pd.plugin.host, pd.pdName)
glog.V(5).Infof("DetachDisk(pd) where pd is %#v and the globalPDPath is %q\r\n", pd, globalPDPath)
// Terminate any in progress verify detach go routines, this will block until the goroutine is ready to exit because the channel is unbuffered
detachCleanupManager.Send(pd.pdName, true)
if err := pd.mounter.Unmount(globalPDPath); err != nil {
return err
}
if err := os.Remove(globalPDPath); err != nil {
return err
}
// Detach the disk
gce, err := cloudprovider.GetCloudProvider("gce", nil)
if err != nil {
return err
}
if err := gce.(*gce_cloud.GCECloud).DetachDisk(pd.pdName); err != nil {
return err
if detachCleanupManager.Exists(pd.pdName) {
glog.Warningf("Terminating new DetachDisk call for GCE PD %q. A previous detach call for this PD is still pending.", pd.pdName)
return nil
}
// Verify disk detached, retry if needed.
go verifyDetached(pd, gce)
// Detach disk, retry if needed.
go detachDiskAndVerify(pd)
return nil
}
// Verifys the disk device to be created has been succesffully attached, and retries if it fails.
func verifyAttached(pd *gcePersistentDisk, sdBeforeSet util.StringSet, gce cloudprovider.Interface) (string, error) {
// Attaches the specified persistent disk device to node, verifies that it is attached, and retries if it fails.
func attachDiskAndVerify(pd *gcePersistentDisk, sdBeforeSet util.StringSet) (string, error) {
devicePaths := getDiskByIdPaths(pd)
var gce cloudprovider.Interface
for numRetries := 0; numRetries < maxRetries; numRetries++ {
if gce == nil {
var err error
gce, err = cloudprovider.GetCloudProvider("gce", nil)
if err != nil || gce == nil {
// Retry on error. See issue #11321
glog.Errorf("Error getting GCECloudProvider while attaching PD %q: %v", pd.pdName, err)
gce = nil
time.Sleep(errorSleepDuration)
continue
}
}
if numRetries > 0 {
glog.Warningf("Timed out waiting for GCE PD %q to attach. Retrying attach.", pd.pdName)
}
if err := gce.(*gce_cloud.GCECloud).AttachDisk(pd.pdName, pd.readOnly); err != nil {
// Retry on error. See issue #11321. Continue and verify if disk is attached, because a
// previous attach operation may still succeed.
glog.Errorf("Error attaching PD %q: %v", pd.pdName, err)
}
for numChecks := 0; numChecks < maxChecks; numChecks++ {
if err := udevadmChangeToNewDrives(sdBeforeSet); err != nil {
// udevadm errors should not block disk attachment, log and continue
@@ -143,84 +155,107 @@ func verifyAttached(pd *gcePersistentDisk, sdBeforeSet util.StringSet, gce cloud
for _, path := range devicePaths {
if pathExists, err := pathExists(path); err != nil {
return "", err
// Retry on error. See issue #11321
glog.Errorf("Error checking if path exists: %v", err)
} else if pathExists {
// A device path has succesfully been created for the PD
glog.V(5).Infof("Succesfully attached GCE PD %q.", pd.pdName)
glog.Infof("Succesfully attached GCE PD %q.", pd.pdName)
return path, nil
}
}
// Sleep then check again
glog.V(5).Infof("Waiting for GCE PD %q to attach.", pd.pdName)
glog.V(3).Infof("Waiting for GCE PD %q to attach.", pd.pdName)
time.Sleep(checkSleepDuration)
}
// Try attaching the disk again
glog.Warningf("Timed out waiting for GCE PD %q to attach. Retrying attach.", pd.pdName)
if err := gce.(*gce_cloud.GCECloud).AttachDisk(pd.pdName, pd.readOnly); err != nil {
return "", err
}
}
return "", fmt.Errorf("Could not attach GCE PD %q. Timeout waiting for mount paths to be created.", pd.pdName)
}
// Veify the specified persistent disk device has been succesfully detached, and retries if it fails.
// Detaches the specified persistent disk device from node, verifies that it is detached, and retries if it fails.
// This function is intended to be called asynchronously as a go routine.
func verifyDetached(pd *gcePersistentDisk, gce cloudprovider.Interface) {
// It starts the detachCleanupManager with the specified pdName so that callers can wait for completion.
func detachDiskAndVerify(pd *gcePersistentDisk) {
glog.V(5).Infof("detachDiskAndVerify for pd %q.", pd.pdName)
defer util.HandleCrash()
// Setting bufferSize to 0 so that when senders send, they are blocked until we recieve. This avoids the need to have a separate exit check.
// Start operation, so that other threads can wait on this detach operation.
// Set bufferSize to 0 so senders are blocked on send until we recieve.
ch, err := detachCleanupManager.Start(pd.pdName, 0 /* bufferSize */)
if err != nil {
glog.Errorf("Error adding %q to detachCleanupManager: %v", pd.pdName, err)
return
}
defer detachCleanupManager.Close(pd.pdName)
devicePaths := getDiskByIdPaths(pd)
for numRetries := 0; numRetries < maxRetries; numRetries++ {
for numChecks := 0; numChecks < maxChecks; numChecks++ {
defer func() {
// Unblock any callers that have been waiting for this detach routine to complete.
for {
select {
case <-ch:
glog.Warningf("Terminating GCE PD %q detach verification. Another attach/detach call was made for this PD.", pd.pdName)
return
glog.V(5).Infof("detachDiskAndVerify for pd %q clearing chan.", pd.pdName)
default:
allPathsRemoved := true
for _, path := range devicePaths {
if err := udevadmChangeToDrive(path); err != nil {
// udevadm errors should not block disk detachment, log and continue
glog.Errorf("%v", err)
}
if exists, err := pathExists(path); err != nil {
glog.Errorf("Error check path: %v", err)
return
} else {
allPathsRemoved = allPathsRemoved && !exists
}
}
if allPathsRemoved {
// All paths to the PD have been succefully removed
glog.V(5).Infof("Succesfully detached GCE PD %q.", pd.pdName)
return
}
glog.V(5).Infof("detachDiskAndVerify for pd %q done clearing chans.", pd.pdName)
return
}
}
}()
// Sleep then check again
glog.V(5).Infof("Waiting for GCE PD %q to detach.", pd.pdName)
time.Sleep(checkSleepDuration)
devicePaths := getDiskByIdPaths(pd)
var gce cloudprovider.Interface
for numRetries := 0; numRetries < maxRetries; numRetries++ {
if gce == nil {
var err error
gce, err = cloudprovider.GetCloudProvider("gce", nil)
if err != nil || gce == nil {
// Retry on error. See issue #11321
glog.Errorf("Error getting GCECloudProvider while detaching PD %q: %v", pd.pdName, err)
gce = nil
time.Sleep(errorSleepDuration)
continue
}
}
// Try detaching disk again
glog.Warningf("Timed out waiting for GCE PD %q to detach. Retrying detach.", pd.pdName)
if err := gce.(*gce_cloud.GCECloud).DetachDisk(pd.pdName); err != nil {
glog.Errorf("Error on retry detach PD %q: %v", pd.pdName, err)
return
if numRetries > 0 {
glog.Warningf("Timed out waiting for GCE PD %q to detach. Retrying detach.", pd.pdName)
}
if err := gce.(*gce_cloud.GCECloud).DetachDisk(pd.pdName); err != nil {
// Retry on error. See issue #11321. Continue and verify if disk is detached, because a
// previous detach operation may still succeed.
glog.Errorf("Error detaching PD %q: %v", pd.pdName, err)
}
for numChecks := 0; numChecks < maxChecks; numChecks++ {
allPathsRemoved := true
for _, path := range devicePaths {
if err := udevadmChangeToDrive(path); err != nil {
// udevadm errors should not block disk detachment, log and continue
glog.Errorf("%v", err)
}
if exists, err := pathExists(path); err != nil {
// Retry on error. See issue #11321
glog.Errorf("Error checking if path exists: %v", err)
} else {
allPathsRemoved = allPathsRemoved && !exists
}
}
if allPathsRemoved {
// All paths to the PD have been succefully removed
glog.Infof("Succesfully detached GCE PD %q.", pd.pdName)
return
}
// Sleep then check again
glog.V(3).Infof("Waiting for GCE PD %q to detach.", pd.pdName)
time.Sleep(checkSleepDuration)
}
}
glog.Errorf("Could not detach GCE PD %q. One or more mount paths was not removed.", pd.pdName)
glog.Errorf("Failed to detach GCE PD %q. One or more mount paths was not removed.", pd.pdName)
}
// Returns list of all /dev/disk/by-id/* paths for given PD.
@@ -326,7 +361,7 @@ func (mounter *gceSafeFormatAndMount) Mount(source string, target string, fstype
cmd := mounter.runner.Command("/usr/share/google/safe_format_and_mount", args...)
dataOut, err := cmd.CombinedOutput()
if err != nil {
glog.V(5).Infof("error running /usr/share/google/safe_format_and_mount\n%s", string(dataOut))
glog.Errorf("error running /usr/share/google/safe_format_and_mount\n%s", string(dataOut))
}
return err
}