Fix race condition in updating attached volume between master and node
This PR tries to fix issue #29324. The cause of this issue is a race condition that happens when marking volumes as attached for node status. This PR tries to clean up the logic of when and where to mark volumes as attached/detached. Basically, the workflow is as follows: 1. When a volume is attached successfully, the volume and node info is added into nodesToUpdateStatusFor to mark the volume as attached to the node. 2. When a detach request comes in, it will check whether it is safe to detach now. If the check passes, remove the volume from volumesToReportAsAttached to indicate the volume is no longer considered as attached now. Afterwards, the reconciler tries to update node status and trigger the detach operation. If any of these operations fails, the volume is added back to the volumesToReportAsAttached list, showing that it is still attached. These steps should make sure that kubelet gets the right (though possibly outdated) information about which volumes are attached or not. It also guarantees that if a detach operation is pending, kubelet should not trigger any mount operations.
This commit is contained in:
@@ -153,6 +153,14 @@ type ActualStateOfWorldAttacherUpdater interface {
|
||||
|
||||
// Marks the specified volume as detached from the specified node
|
||||
MarkVolumeAsDetached(volumeName api.UniqueVolumeName, nodeName string)
|
||||
|
||||
// Marks desire to detach the specified volume (remove the volume from the node's
|
||||
// volumesToReportAsAttached list)
|
||||
RemoveVolumeFromReportAsAttached(volumeName api.UniqueVolumeName, nodeName string) error
|
||||
|
||||
// Unmarks the desire to detach for the specified volume (add the volume back to
|
||||
// the node's volumesToReportAsAttached list)
|
||||
AddVolumeToReportAsAttached(volumeName api.UniqueVolumeName, nodeName string)
|
||||
}
|
||||
|
||||
// VolumeToAttach represents a volume that should be attached to a node.
|
||||
@@ -561,24 +569,23 @@ func (oe *operationExecutor) generateDetachVolumeFunc(
|
||||
}
|
||||
|
||||
return func() error {
|
||||
var err error
|
||||
if verifySafeToDetach {
|
||||
safeToDetachErr := oe.verifyVolumeIsSafeToDetach(volumeToDetach)
|
||||
if safeToDetachErr != nil {
|
||||
// On failure, return error. Caller will log and retry.
|
||||
return err
|
||||
}
|
||||
err = oe.verifyVolumeIsSafeToDetach(volumeToDetach)
|
||||
}
|
||||
|
||||
// Execute detach
|
||||
detachErr := volumeDetacher.Detach(volumeName, volumeToDetach.NodeName)
|
||||
if detachErr != nil {
|
||||
// On failure, return error. Caller will log and retry.
|
||||
if err == nil {
|
||||
err = volumeDetacher.Detach(volumeName, volumeToDetach.NodeName)
|
||||
}
|
||||
if err != nil {
|
||||
// On failure, add volume back to ReportAsAttached list
|
||||
actualStateOfWorld.AddVolumeToReportAsAttached(
|
||||
volumeToDetach.VolumeName, volumeToDetach.NodeName)
|
||||
return fmt.Errorf(
|
||||
"DetachVolume.Detach failed for volume %q (spec.Name: %q) from node %q with: %v",
|
||||
volumeToDetach.VolumeName,
|
||||
volumeToDetach.VolumeSpec.Name(),
|
||||
volumeToDetach.NodeName,
|
||||
detachErr)
|
||||
err)
|
||||
}
|
||||
|
||||
glog.Infof(
|
||||
|
Reference in New Issue
Block a user