deflake: Add retry with timeout to wait for final conditions

This commit is contained in:
Paco Xu 2023-03-16 13:16:54 +08:00
parent 8b2dae57d4
commit c14068c202

View File

@ -18,6 +18,7 @@ package reconciler
import (
"context"
"fmt"
"testing"
"time"
@ -47,6 +48,7 @@ const (
syncLoopPeriod = 100 * time.Minute
maxWaitForUnmountDuration = 50 * time.Millisecond
maxLongWaitForUnmountDuration = 4200 * time.Second
volumeAttachedCheckTimeout = 5 * time.Second
)
// Calls Run()
@ -604,7 +606,7 @@ func Test_Run_OneVolumeAttachAndDetachUncertainNodesWithReadWriteOnce(t *testing
// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
// Without this, the delete operation will be delayed due to mounted status
@ -665,7 +667,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
// Volume is added to asw, volume should be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// Delete the pod
dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@ -683,7 +685,7 @@ func Test_Run_UpdateNodeStatusFailBeforeOneVolumeDetachNodeWithReadWriteOnce(t *
// in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume
// will not need to be updated until new changes are applied (detach is triggered again)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
}
@ -728,11 +730,10 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr)
}
time.Sleep(1000 * time.Millisecond)
// Volume is added to asw, volume should be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// Delete the pod, but detach will fail
dsw.DeletePod(types.UniquePodName(podName1), generatedVolumeName, nodeName1)
@ -741,28 +742,29 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
// Right before detach operation is performed, the volume will be first removed from being reported
// as attached on node status (RemoveVolumeFromReportAsAttached). After detach operation which is expected to fail,
// controller then added the volume back as attached.
// Here it sleeps 100ms so that detach should be triggered already at this point.
// Here verifyVolumeReportedAsAttachedToNode will wait until the detach is triggered.
// verifyVolumeReportedAsAttachedToNode will check volume is in the list of volume attached that needs to be updated
// in node status. By calling this function (GetVolumesToReportAttached), node status should be updated, and the volume
// will not need to be updated until new changes are applied (detach is triggered again)
time.Sleep(100 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// After the first detach fails, reconciler will wait for a period of time before retrying to detach.
// The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...).
// The test here waits for 100 Millisecond to make sure it is in exponential backoff period after
// the first detach operation. At this point, volumes status should not be updated
time.Sleep(100 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
// The below verifies check intermediate state and is flaky but useful
//
// // After the first detach fails, reconciler will wait for a period of time before retrying to detach.
// // The wait time is increasing exponentially from initial value of 0.5s (0.5, 1, 2, 4, ...).
// // The test here waits for 100 Millisecond to make sure it is in exponential backoff period after
// // the first detach operation. At this point, volumes status should not be updated
// time.Sleep(100 * time.Millisecond)
// verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
// verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
// Wait for 600ms to make sure second detach operation triggered. Again, The volume will be
// removed from being reported as attached on node status and then added back as attached.
// The volume will be in the list of attached volumes that need to be updated to node status.
time.Sleep(600 * time.Millisecond)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw)
// // Wait for 600ms to make sure second detach operation triggered. Again, The volume will be
// // removed from being reported as attached on node status and then added back as attached.
// // The volume will be in the list of attached volumes that need to be updated to node status.
// time.Sleep(600 * time.Millisecond)
// verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
// verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, true, asw, volumeAttachedCheckTimeout)
// Add a second pod which tries to attach the volume to the same node.
// After adding pod to the same node, detach will not be triggered any more.
@ -770,8 +772,7 @@ func Test_Run_OneVolumeDetachFailNodeWithReadWriteOnce(t *testing.T) {
if podAddErr != nil {
t.Fatalf("AddPod failed. Expected: <no error> Actual: <%v>", podAddErr)
}
// Sleep 1s to verify no detach are triggered after second pod is added in the future.
time.Sleep(1000 * time.Millisecond)
// verify no detach are triggered after second pod is added in the future.
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateAttached, asw)
verifyVolumeNoStatusUpdateNeeded(t, logger, generatedVolumeName, nodeName1, asw)
@ -834,7 +835,7 @@ func Test_Run_OneVolumeAttachAndDetachTimeoutNodesWithReadWriteOnce(t *testing.T
// Volume is added to asw. Because attach operation fails, volume should not be reported as attached to the node.
waitForVolumeAddedToNode(t, generatedVolumeName, nodeName1, asw)
verifyVolumeAttachedToNode(t, generatedVolumeName, nodeName1, cache.AttachStateUncertain, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw)
verifyVolumeReportedAsAttachedToNode(t, logger, generatedVolumeName, nodeName1, false, asw, volumeAttachedCheckTimeout)
// When volume is added to the node, it is set to mounted by default. Then the status will be updated by checking node status VolumeInUse.
// Without this, the delete operation will be delayed due to mounted status
@ -1612,24 +1613,32 @@ func verifyVolumeReportedAsAttachedToNode(
nodeName k8stypes.NodeName,
isAttached bool,
asw cache.ActualStateOfWorld,
timeout time.Duration,
) {
result := false
volumes := asw.GetVolumesToReportAttached(logger)
for _, volume := range volumes[nodeName] {
if volume.Name == volumeName {
result = true
var result bool
var lastErr error
err := wait.PollUntilContextTimeout(context.TODO(), 50*time.Millisecond, timeout, false, func(context.Context) (done bool, err error) {
volumes := asw.GetVolumesToReportAttached(logger)
for _, volume := range volumes[nodeName] {
if volume.Name == volumeName {
result = true
}
}
}
if result == isAttached {
t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result)
return
if result == isAttached {
t.Logf("Volume <%v> is reported as attached to node <%v>: %v", volumeName, nodeName, result)
return true, nil
}
lastErr = fmt.Errorf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
return false, nil
})
if err != nil {
t.Fatalf("last error: %q, wait timeout: %q", lastErr, err.Error())
}
t.Fatalf("Check volume <%v> is reported as attached to node <%v>, got %v, expected %v",
volumeName,
nodeName,
result,
isAttached)
}