AWS EBS: Remove the attached volumes cache
There are known issues with the attached-volume state cache that just aren't possible to fix with the current interface. Replace it with a map of the active attach jobs (that was the original requirement, to avoid a nasty race condition). This costs us an extra DescribeInstance call on attach/detach, but that seems worth it if it ends this class of bugs. Fix #15073
This commit is contained in:
		@@ -932,9 +932,10 @@ type awsInstance struct {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	mutex sync.Mutex
 | 
						mutex sync.Mutex
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// We must cache because otherwise there is a race condition,
 | 
						// We keep an active list of devices we have assigned but not yet
 | 
				
			||||||
	// where we assign a device mapping and then get a second request before we attach the volume
 | 
						// attached, to avoid a race condition where we assign a device mapping
 | 
				
			||||||
	deviceMappings map[mountDevice]string
 | 
						// and then get a second request before we attach the volume
 | 
				
			||||||
 | 
						attaching map[mountDevice]string
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// newAWSInstance creates a new awsInstance object
 | 
					// newAWSInstance creates a new awsInstance object
 | 
				
			||||||
@@ -953,8 +954,7 @@ func newAWSInstance(ec2Service EC2, instance *ec2.Instance) *awsInstance {
 | 
				
			|||||||
		subnetID:         aws.StringValue(instance.SubnetId),
 | 
							subnetID:         aws.StringValue(instance.SubnetId),
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// We lazy-init deviceMappings
 | 
						self.attaching = make(map[mountDevice]string)
 | 
				
			||||||
	self.deviceMappings = nil
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return self
 | 
						return self
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -1001,8 +1001,6 @@ func (self *awsInstance) getMountDevice(volumeID string, assign bool) (assigned
 | 
				
			|||||||
	self.mutex.Lock()
 | 
						self.mutex.Lock()
 | 
				
			||||||
	defer self.mutex.Unlock()
 | 
						defer self.mutex.Unlock()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// We cache both for efficiency and correctness
 | 
					 | 
				
			||||||
	if self.deviceMappings == nil {
 | 
					 | 
				
			||||||
	info, err := self.describeInstance()
 | 
						info, err := self.describeInstance()
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		return "", false, err
 | 
							return "", false, err
 | 
				
			||||||
@@ -1021,11 +1019,13 @@ func (self *awsInstance) getMountDevice(volumeID string, assign bool) (assigned
 | 
				
			|||||||
		}
 | 
							}
 | 
				
			||||||
		deviceMappings[mountDevice(name)] = aws.StringValue(blockDevice.Ebs.VolumeId)
 | 
							deviceMappings[mountDevice(name)] = aws.StringValue(blockDevice.Ebs.VolumeId)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
		self.deviceMappings = deviceMappings
 | 
					
 | 
				
			||||||
 | 
						for mountDevice, volume := range self.attaching {
 | 
				
			||||||
 | 
							deviceMappings[mountDevice] = volume
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// Check to see if this volume is already assigned a device on this machine
 | 
						// Check to see if this volume is already assigned a device on this machine
 | 
				
			||||||
	for mountDevice, mappingVolumeID := range self.deviceMappings {
 | 
						for mountDevice, mappingVolumeID := range deviceMappings {
 | 
				
			||||||
		if volumeID == mappingVolumeID {
 | 
							if volumeID == mappingVolumeID {
 | 
				
			||||||
			if assign {
 | 
								if assign {
 | 
				
			||||||
				glog.Warningf("Got assignment call for already-assigned volume: %s@%s", mountDevice, mappingVolumeID)
 | 
									glog.Warningf("Got assignment call for already-assigned volume: %s@%s", mountDevice, mappingVolumeID)
 | 
				
			||||||
@@ -1042,7 +1042,7 @@ func (self *awsInstance) getMountDevice(volumeID string, assign bool) (assigned
 | 
				
			|||||||
	valid := instanceType.getEBSMountDevices()
 | 
						valid := instanceType.getEBSMountDevices()
 | 
				
			||||||
	chosen := mountDevice("")
 | 
						chosen := mountDevice("")
 | 
				
			||||||
	for _, mountDevice := range valid {
 | 
						for _, mountDevice := range valid {
 | 
				
			||||||
		_, found := self.deviceMappings[mountDevice]
 | 
							_, found := deviceMappings[mountDevice]
 | 
				
			||||||
		if !found {
 | 
							if !found {
 | 
				
			||||||
			chosen = mountDevice
 | 
								chosen = mountDevice
 | 
				
			||||||
			break
 | 
								break
 | 
				
			||||||
@@ -1050,31 +1050,31 @@ func (self *awsInstance) getMountDevice(volumeID string, assign bool) (assigned
 | 
				
			|||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if chosen == "" {
 | 
						if chosen == "" {
 | 
				
			||||||
		glog.Warningf("Could not assign a mount device (all in use?).  mappings=%v, valid=%v", self.deviceMappings, valid)
 | 
							glog.Warningf("Could not assign a mount device (all in use?).  mappings=%v, valid=%v", deviceMappings, valid)
 | 
				
			||||||
		return "", false, nil
 | 
							return "", false, nil
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	self.deviceMappings[chosen] = volumeID
 | 
						self.attaching[chosen] = volumeID
 | 
				
			||||||
	glog.V(2).Infof("Assigned mount device %s -> volume %s", chosen, volumeID)
 | 
						glog.V(2).Infof("Assigned mount device %s -> volume %s", chosen, volumeID)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return chosen, false, nil
 | 
						return chosen, false, nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func (self *awsInstance) releaseMountDevice(volumeID string, mountDevice mountDevice) {
 | 
					func (self *awsInstance) endAttaching(volumeID string, mountDevice mountDevice) {
 | 
				
			||||||
	self.mutex.Lock()
 | 
						self.mutex.Lock()
 | 
				
			||||||
	defer self.mutex.Unlock()
 | 
						defer self.mutex.Unlock()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	existingVolumeID, found := self.deviceMappings[mountDevice]
 | 
						existingVolumeID, found := self.attaching[mountDevice]
 | 
				
			||||||
	if !found {
 | 
						if !found {
 | 
				
			||||||
		glog.Errorf("releaseMountDevice on non-allocated device")
 | 
							glog.Errorf("endAttaching on non-allocated device")
 | 
				
			||||||
		return
 | 
							return
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	if volumeID != existingVolumeID {
 | 
						if volumeID != existingVolumeID {
 | 
				
			||||||
		glog.Errorf("releaseMountDevice on device assigned to different volume")
 | 
							glog.Errorf("endAttaching on device assigned to different volume")
 | 
				
			||||||
		return
 | 
							return
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	glog.V(2).Infof("Releasing mount device mapping: %s -> volume %s", mountDevice, volumeID)
 | 
						glog.V(2).Infof("Releasing mount device mapping: %s -> volume %s", mountDevice, volumeID)
 | 
				
			||||||
	delete(self.deviceMappings, mountDevice)
 | 
						delete(self.attaching, mountDevice)
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
type awsDisk struct {
 | 
					type awsDisk struct {
 | 
				
			||||||
@@ -1280,8 +1280,8 @@ func (c *AWSCloud) AttachDisk(diskName string, instanceName string, readOnly boo
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	attached := false
 | 
						attached := false
 | 
				
			||||||
	defer func() {
 | 
						defer func() {
 | 
				
			||||||
		if !attached {
 | 
							if attached {
 | 
				
			||||||
			awsInstance.releaseMountDevice(disk.awsID, mountDevice)
 | 
								awsInstance.endAttaching(disk.awsID, mountDevice)
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	}()
 | 
						}()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -1346,33 +1346,15 @@ func (aws *AWSCloud) DetachDisk(diskName string, instanceName string) (string, e
 | 
				
			|||||||
		return "", errors.New("no response from DetachVolume")
 | 
							return "", errors.New("no response from DetachVolume")
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// TODO: Fix this - just remove the cache?
 | 
					 | 
				
			||||||
	// If we don't have a cache; we don't have to wait any more (the driver does it for us)
 | 
					 | 
				
			||||||
	// Also, maybe we could get the locally connected drivers from the AWS metadata service?
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	// At this point we are waiting for the volume being detached. This
 | 
					 | 
				
			||||||
	// releases the volume and invalidates the cache even when there is a timeout.
 | 
					 | 
				
			||||||
	//
 | 
					 | 
				
			||||||
	// TODO: A timeout leaves the cache in an inconsistent state. The volume is still
 | 
					 | 
				
			||||||
	// detaching though the cache shows it as ready to be attached again. Subsequent
 | 
					 | 
				
			||||||
	// attach operations will fail. The attach is being retried and eventually
 | 
					 | 
				
			||||||
	// works though. An option would be to completely flush the cache upon timeouts.
 | 
					 | 
				
			||||||
	//
 | 
					 | 
				
			||||||
	defer func() {
 | 
					 | 
				
			||||||
		// TODO: Not thread safe?
 | 
					 | 
				
			||||||
		for mountDevice, existingVolumeID := range awsInstance.deviceMappings {
 | 
					 | 
				
			||||||
			if existingVolumeID == disk.awsID {
 | 
					 | 
				
			||||||
				awsInstance.releaseMountDevice(disk.awsID, mountDevice)
 | 
					 | 
				
			||||||
				return
 | 
					 | 
				
			||||||
			}
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
	}()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	err = disk.waitForAttachmentStatus("detached")
 | 
						err = disk.waitForAttachmentStatus("detached")
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		return "", err
 | 
							return "", err
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if mountDevice != "" {
 | 
				
			||||||
 | 
							awsInstance.endAttaching(disk.awsID, mountDevice)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	hostDevicePath := "/dev/xvd" + string(mountDevice)
 | 
						hostDevicePath := "/dev/xvd" + string(mountDevice)
 | 
				
			||||||
	return hostDevicePath, err
 | 
						return hostDevicePath, err
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user