diff --git a/pkg/controller/volume/attachdetach/reconciler/reconciler_test.go b/pkg/controller/volume/attachdetach/reconciler/reconciler_test.go index f4c3c52fb7e..2dd3378de73 100644 --- a/pkg/controller/volume/attachdetach/reconciler/reconciler_test.go +++ b/pkg/controller/volume/attachdetach/reconciler/reconciler_test.go @@ -18,6 +18,7 @@ package reconciler import ( "context" + "fmt" "testing" "time" @@ -47,6 +48,7 @@ const ( syncLoopPeriod = 100 * time.Minute maxWaitForUnmountDuration = 50 * time.Millisecond maxLongWaitForUnmountDuration = 4200 * time.Second + volumeAttachedCheckTimeout = 5 * time.Second ) // Calls Run() @@ -604,7 +606,7 @@ func Test_Run_OneVolumeAttachAndDetachUncertainNodesWithReadWriteOnce(t *testing // Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node. waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) - verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) + verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout) // When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse. // Without this, the delete operation will be delayed due to mounted status @@ -665,7 +667,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t * // Volume is added to asw, volume should be reported as attached to the node. waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) - verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) + verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout) // Delete the pod dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1) @@ -683,7 +685,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t * // in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume // will not need to be updated until new changes are applied (detach is triggered again) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) - verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) + verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout) } @@ -728,11 +730,10 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) { t.Fatalf("AddPod failed. Expected: Actual: <%v>", podAddErr) } - time.Sleep(1000 * time.Millisecond) // Volume is added to asw, volume should be reported as attached to the node. waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) - verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) + verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout) // Delete the pod, but detach will fail dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1) @@ -741,28 +742,29 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) { // Right before detach operation is performed, the volume will be first removed from being reported // as attached on node status (RemoveVolumeFromReportAsAttached). After detach operation which is expected to fail, // controller then added the volume back as attached. - // Here it sleeps 100ms so that detach should be triggered already at this point. + // Here verifyVolumeReportedAsAttachedToNode will wait until the detach is triggered. // verifyVolumeReportedAsAttachedToNode will check volume is in the list of volume attached that needs to be updated // in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume // will not need to be updated until new changes are applied (detach is triggered again) - time.Sleep(100 * time.Millisecond) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) - verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) + verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout) - // After the first detach fails, reconciler will wait for a period of time before retrying to detach. - // The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...). - // The test here waits for 100 Millisecond to make sure it is in exponential backoff period after - // the first detach operation. At this point, volumes status should not be updated - time.Sleep(100 * time.Millisecond) - verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) - verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw) + // The below verifies check intermediate state and is flaky but useful + // + // // After the first detach fails, reconciler will wait for a period of time before retrying to detach. + // // The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...). + // // The test here waits for 100 Millisecond to make sure it is in exponential backoff period after + // // the first detach operation. At this point, volumes status should not be updated + // time.Sleep(100 * time.Millisecond) + // verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) + // verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw) - // Wait for 600ms to make sure second detach operation triggered. Again, The volume will be - // removed from being reported as attached on node status and then added back as attached. - // The volume will be in the list of attached volumes that need to be updated to node status. - time.Sleep(600 * time.Millisecond) - verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) - verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) + // // Wait for 600ms to make sure second detach operation triggered. Again, The volume will be + // // removed from being reported as attached on node status and then added back as attached. + // // The volume will be in the list of attached volumes that need to be updated to node status. + // time.Sleep(600 * time.Millisecond) + // verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) + // verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout) // Add a second pod which tries to attach the volume to the same node. // After adding pod to the same node, detach will not be triggered any more. @@ -770,8 +772,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) { if podAddErr != nil { t.Fatalf("AddPod failed. Expected: Actual: <%v>", podAddErr) } - // Sleep 1s to verify no detach are triggered after second pod is added in the future. - time.Sleep(1000 * time.Millisecond) + // verify no detach are triggered after second pod is added in the future. verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw) @@ -834,7 +835,7 @@ func Test_Run_OneVolumeAttachAndDetachTimeoutNodesWithReadWriteOnce(t *testing.T // Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node. waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateUncertain, asw) - verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw) + verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw, volumeAttachedCheckTimeout) // When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse. // Without this, the delete operation will be delayed due to mounted status @@ -1612,24 +1613,32 @@ func verifyVolumeReportedAsAttachedToNode( nodeName k8stypes.NodeName, isAttached bool, asw cache.ActualStateOfWorld, + timeout time.Duration, ) { - result := false - volumes := asw.GetVolumesToReportAttached(logger) - for _, volume := range volumes[nodeName] { - if volume.Name == volumeName { - result = true + var result bool + var lastErr error + err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, timeout, false, func(context.Context) (done bool, err error) { + volumes := asw.GetVolumesToReportAttached(logger) + for _, volume := range volumes[nodeName] { + if volume.Name == volumeName { + result = true + } } - } - if result == isAttached { - t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result) - return + if result == isAttached { + t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result) + return true, nil + } + lastErr = fmt.Errorf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v", + volumeName, + nodeName, + result, + isAttached) + return false, nil + }) + if err != nil { + t.Fatalf("last error: %q, wait timeout: %q", lastErr, err.Error()) } - t.Fatalf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v", - volumeName, - nodeName, - result, - isAttached) }