deflake: Add retry with timeout to wait for final conditions

This commit is contained in:
Paco Xu 2023-03-16 13:16:54 +08:00
parent 8b2dae57d4
commit c14068c202

View File

@ -18,6 +18,7 @@ package reconciler
import ( import (
"context" "context"
"fmt"
"testing" "testing"
"time" "time"
@ -47,6 +48,7 @@ const (
syncLoopPeriod = 100 * time.Minute syncLoopPeriod = 100 * time.Minute
maxWaitForUnmountDuration = 50 * time.Millisecond maxWaitForUnmountDuration = 50 * time.Millisecond
maxLongWaitForUnmountDuration = 4200 * time.Second maxLongWaitForUnmountDuration = 4200 * time.Second
volumeAttachedCheckTimeout = 5 * time.Second
) )
// Calls Run() // Calls Run()
@ -604,7 +606,7 @@ func Test_Run_OneVolumeAttachAndDetachUncertainNodesWithReadWriteOnce(t *testing
// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node. // Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse. // When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
// Without this, the delete operation will be delayed due to mounted status // Without this, the delete operation will be delayed due to mounted status
@ -665,7 +667,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
// Volume is added to asw, volume should be reported as attached to the node. // Volume is added to asw, volume should be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// Delete the pod // Delete the pod
dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1) dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@ -683,7 +685,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
// in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume // in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume
// will not need to be updated until new changes are applied (detach is triggered again) // will not need to be updated until new changes are applied (detach is triggered again)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
} }
@ -728,11 +730,10 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr) t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr)
} }
time.Sleep(1000 * time.Millisecond)
// Volume is added to asw, volume should be reported as attached to the node. // Volume is added to asw, volume should be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// Delete the pod, but detach will fail // Delete the pod, but detach will fail
dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1) dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@ -741,28 +742,29 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
// Right before detach operation is performed, the volume will be first removed from being reported // Right before detach operation is performed, the volume will be first removed from being reported
// as attached on node status (RemoveVolumeFromReportAsAttached). After detach operation which is expected to fail, // as attached on node status (RemoveVolumeFromReportAsAttached). After detach operation which is expected to fail,
// controller then added the volume back as attached. // controller then added the volume back as attached.
// Here it sleeps 100ms so that detach should be triggered already at this point. // Here verifyVolumeReportedAsAttachedToNode will wait until the detach is triggered.
// verifyVolumeReportedAsAttachedToNode will check volume is in the list of volume attached that needs to be updated // verifyVolumeReportedAsAttachedToNode will check volume is in the list of volume attached that needs to be updated
// in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume // in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume
// will not need to be updated until new changes are applied (detach is triggered again) // will not need to be updated until new changes are applied (detach is triggered again)
time.Sleep(100 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// After the first detach fails, reconciler will wait for a period of time before retrying to detach. // The below verifies check intermediate state and is flaky but useful
// The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...). //
// The test here waits for 100 Millisecond to make sure it is in exponential backoff period after // // After the first detach fails, reconciler will wait for a period of time before retrying to detach.
// the first detach operation. At this point, volumes status should not be updated // // The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...).
time.Sleep(100 * time.Millisecond) // // The test here waits for 100 Millisecond to make sure it is in exponential backoff period after
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) // // the first detach operation. At this point, volumes status should not be updated
verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw) // time.Sleep(100 * time.Millisecond)
// verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
// verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
// Wait for 600ms to make sure second detach operation triggered. Again, The volume will be // // Wait for 600ms to make sure second detach operation triggered. Again, The volume will be
// removed from being reported as attached on node status and then added back as attached. // // removed from being reported as attached on node status and then added back as attached.
// The volume will be in the list of attached volumes that need to be updated to node status. // // The volume will be in the list of attached volumes that need to be updated to node status.
time.Sleep(600 * time.Millisecond) // time.Sleep(600 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) // verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw) // verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// Add a second pod which tries to attach the volume to the same node. // Add a second pod which tries to attach the volume to the same node.
// After adding pod to the same node, detach will not be triggered any more. // After adding pod to the same node, detach will not be triggered any more.
@ -770,8 +772,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
if podAddErr != nil { if podAddErr != nil {
t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr) t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr)
} }
// Sleep 1s to verify no detach are triggered after second pod is added in the future. // verify no detach are triggered after second pod is added in the future.
time.Sleep(1000 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw) verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
@ -834,7 +835,7 @@ func Test_Run_OneVolumeAttachAndDetachTimeoutNodesWithReadWriteOnce(t *testing.T
// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node. // Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw) waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateUncertain, asw) verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateUncertain, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw) verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw, volumeAttachedCheckTimeout)
// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse. // When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
// Without this, the delete operation will be delayed due to mounted status // Without this, the delete operation will be delayed due to mounted status
@ -1612,24 +1613,32 @@ func verifyVolumeReportedAsAttachedToNode(
nodeName k8stypes.NodeName, nodeName k8stypes.NodeName,
isAttached bool, isAttached bool,
asw cache.ActualStateOfWorld, asw cache.ActualStateOfWorld,
timeout time.Duration,
) { ) {
result := false var result bool
volumes := asw.GetVolumesToReportAttached(logger) var lastErr error
for _, volume := range volumes[nodeName] { err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, timeout, false, func(context.Context) (done bool, err error) {
if volume.Name == volumeName { volumes := asw.GetVolumesToReportAttached(logger)
result = true for _, volume := range volumes[nodeName] {
if volume.Name == volumeName {
result = true
}
} }
}
if result == isAttached { if result == isAttached {
t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result) t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result)
return return true, nil
}
lastErr = fmt.Errorf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
return false, nil
})
if err != nil {
t.Fatalf("last error: %q, wait timeout: %q", lastErr, err.Error())
} }
t.Fatalf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
} }