Fix issue in node status updating VolumeAttached list
During volume detach, the following might happen in reconciler 1. Pod is deleting 2. remove volume from reportedAsAttached, so node status updater will update volumeAttached list 3. detach failed due to some issue 4. volume is added back in reportedAsAttached 5. reconciler loops again the volume, remove volume from reportedAsAttached 6. detach will not be trigged because exponential back off, detach call will fail with exponential backoff error 7. another pod is added which using the same volume on the same node 8. reconciler loops and it will NOT try to tigger detach anymore At this point, volume is still attached and in actual state, but volumeAttached list in node status does not has this volume anymore, and will block volume mount from kubelet. The fix in first round is to add volume back into the volume list that need to reported as attached at step 6 when detach call failed with error (exponentical backoff). However this might has some performance issue if detach fail for a while. During this time, volume will be keep removing/adding back to node status which will cause a surge of API calls. So we changed to logic to check first whether operation is safe to retry which means no pending operation or it is not in exponentical backoff time period before calling detach. This way we can avoid keep removing/adding volume from node status. Change-Id: I5d4e760c880d72937d34b9d3e904ecad125f802e
This commit is contained in:
@@ -151,13 +151,13 @@ func (rc *reconciler) reconcile() {
|
||||
// The operation key format is different depending on whether the volume
|
||||
// allows multi attach across different nodes.
|
||||
if util.IsMultiAttachAllowed(attachedVolume.VolumeSpec) {
|
||||
if rc.attacherDetacher.IsOperationPending(attachedVolume.VolumeName, "" /* podName */, attachedVolume.NodeName) {
|
||||
klog.V(10).Infof("Operation for volume %q is already running for node %q. Can't start detach", attachedVolume.VolumeName, attachedVolume.NodeName)
|
||||
if !rc.attacherDetacher.IsOperationSafeToRetry(attachedVolume.VolumeName, "" /* podName */, attachedVolume.NodeName, operationexecutor.DetachOperationName) {
|
||||
klog.V(10).Infof("Operation for volume %q is already running or still in exponential backoff for node %q. Can't start detach", attachedVolume.VolumeName, attachedVolume.NodeName)
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
if rc.attacherDetacher.IsOperationPending(attachedVolume.VolumeName, "" /* podName */, "" /* nodeName */) {
|
||||
klog.V(10).Infof("Operation for volume %q is already running in the cluster. Can't start detach for %q", attachedVolume.VolumeName, attachedVolume.NodeName)
|
||||
if !rc.attacherDetacher.IsOperationSafeToRetry(attachedVolume.VolumeName, "" /* podName */, "" /* nodeName */, operationexecutor.DetachOperationName) {
|
||||
klog.V(10).Infof("Operation for volume %q is already running or still in exponential backoff in the cluster. Can't start detach for %q", attachedVolume.VolumeName, attachedVolume.NodeName)
|
||||
continue
|
||||
}
|
||||
}
|
||||
@@ -193,6 +193,8 @@ func (rc *reconciler) reconcile() {
|
||||
|
||||
// Before triggering volume detach, mark volume as detached and update the node status
|
||||
// If it fails to update node status, skip detach volume
|
||||
// If volume detach operation fails, the volume needs to be added back to report as attached so that node status
|
||||
// has the correct volume attachment information.
|
||||
err = rc.actualStateOfWorld.RemoveVolumeFromReportAsAttached(attachedVolume.VolumeName, attachedVolume.NodeName)
|
||||
if err != nil {
|
||||
klog.V(5).Infof("RemoveVolumeFromReportAsAttached failed while removing volume %q from node %q with: %v",
|
||||
@@ -222,10 +224,17 @@ func (rc *reconciler) reconcile() {
|
||||
klog.Warningf(attachedVolume.GenerateMsgDetailed("attacherDetacher.DetachVolume started", fmt.Sprintf("This volume is not safe to detach, but maxWaitForUnmountDuration %v expired, force detaching", rc.maxWaitForUnmountDuration)))
|
||||
}
|
||||
}
|
||||
if err != nil && !exponentialbackoff.IsExponentialBackoff(err) {
|
||||
// Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected.
|
||||
// Log all other errors.
|
||||
klog.Errorf(attachedVolume.GenerateErrorDetailed("attacherDetacher.DetachVolume failed to start", err).Error())
|
||||
if err != nil {
|
||||
// Add volume back to ReportAsAttached if DetachVolume call failed so that node status updater will add it back to VolumeAttached list.
|
||||
// This function is also called during executing the volume detach operation in operation_generoator.
|
||||
// It is needed here too because DetachVolume call might fail before executing the actual operation in operation_executor (e.g., cannot find volume plugin etc.)
|
||||
rc.actualStateOfWorld.AddVolumeToReportAsAttached(attachedVolume.VolumeName, attachedVolume.NodeName)
|
||||
|
||||
if !exponentialbackoff.IsExponentialBackoff(err) {
|
||||
// Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected.
|
||||
// Log all other errors.
|
||||
klog.Errorf(attachedVolume.GenerateErrorDetailed("attacherDetacher.DetachVolume failed to start", err).Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user