From 14dfeecd47959e752e360e5bc43d247c11b42b27 Mon Sep 17 00:00:00 2001
From: Aleksandra Malinowska
Date: Wed, 20 Sep 2017 19:56:52 +0200
Subject: [PATCH] improve retrying logic when checking CA status

---
 .../autoscaling/cluster_size_autoscaling.go   | 78 +++++++++++++++----
 1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/test/e2e/autoscaling/cluster_size_autoscaling.go b/test/e2e/autoscaling/cluster_size_autoscaling.go
index c2fbdabb782..c6c8d0e6dda 100644
--- a/test/e2e/autoscaling/cluster_size_autoscaling.go
+++ b/test/e2e/autoscaling/cluster_size_autoscaling.go
@@ -62,6 +62,7 @@ const (
 	rcCreationRetryDelay   = 20 * time.Second
 	makeSchedulableTimeout = 10 * time.Minute
 	makeSchedulableDelay   = 20 * time.Second
+	freshStatusLimit       = 20 * time.Second
 
 	gkeEndpoint      = "https://test-container.sandbox.googleapis.com"
 	gkeUpdateTimeout = 15 * time.Minute
@@ -73,6 +74,7 @@ const (
 
 	caNoScaleUpStatus      = "NoActivity"
 	caOngoingScaleUpStatus = "InProgress"
+	timestampFormat        = "2006-01-02 15:04:05 -0700 MST"
 )
 
 var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
@@ -192,15 +194,21 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	})
 
 	It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
-		status, err := getScaleUpStatus(c)
+		// Wait for the situation to stabilize - CA should be running and have up-to-date node readiness info.
+		status, err := waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
+			return s.ready == s.target && s.ready <= nodeCount
+		}, scaleUpTriggerTimeout)
 		framework.ExpectNoError(err)
+
 		unmanagedNodes := nodeCount - status.ready
 
 		By("Schedule more pods than can fit and wait for cluster to scale-up")
 		ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
 		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
 
-		status, err = waitForScaleUpStatus(c, caOngoingScaleUpStatus, scaleUpTriggerTimeout)
+		status, err = waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
+			return s.status == caOngoingScaleUpStatus
+		}, scaleUpTriggerTimeout)
 		framework.ExpectNoError(err)
 		target := status.target
 		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
@@ -211,6 +219,7 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 		if status.target != target {
 			glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
 		}
+		Expect(status.timestamp.Add(freshStatusLimit).Before(time.Now())).Should(Equal(false))
 		Expect(status.status).Should(Equal(caNoScaleUpStatus))
 		Expect(status.ready).Should(Equal(status.target))
 		Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
@@ -1342,9 +1351,30 @@ func getClusterwideStatus(c clientset.Interface) (string, error) {
 }
 
 type scaleUpStatus struct {
-	status string
-	ready  int
-	target int
+	status    string
+	ready     int
+	target    int
+	timestamp time.Time
+}
+
+// Try to get timestamp from status.
+// Status configmap is not parsing-friendly, so evil regexpery follows.
+func getStatusTimestamp(status string) (time.Time, error) {
+	timestampMatcher, err := regexp.Compile("Cluster-autoscaler status at \\s*([0-9\\-]+ [0-9]+:[0-9]+:[0-9]+\\.[0-9]+ \\+[0-9]+ [A-Za-z]+):")
+	if err != nil {
+		return time.Time{}, err
+	}
+
+	timestampMatches := timestampMatcher.FindAllStringSubmatch(status, -1)
+	if len(timestampMatches) < 1 {
+		return time.Time{}, fmt.Errorf("Failed to parse CA status timestamp, raw status: %v", status)
+	}
+
+	timestamp, err := time.Parse(timestampFormat, timestampMatches[0][1])
+	if err != nil {
+		return time.Time{}, err
+	}
+	return timestamp, nil
 }
 
 // Try to get scaleup statuses of all node groups.
@@ -1358,6 +1388,12 @@ func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
 	if !ok {
 		return nil, fmt.Errorf("Status information not found in configmap")
 	}
+
+	timestamp, err := getStatusTimestamp(status)
+	if err != nil {
+		return nil, err
+	}
+
 	matcher, err := regexp.Compile("s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)")
 	if err != nil {
 		return nil, err
@@ -1366,10 +1402,12 @@ func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
 	if len(matches) < 1 {
 		return nil, fmt.Errorf("Failed to parse CA status configmap, raw status: %v", status)
 	}
+
 	result := scaleUpStatus{
-		status: caNoScaleUpStatus,
-		ready:  0,
-		target: 0,
+		status:    caNoScaleUpStatus,
+		ready:     0,
+		target:    0,
+		timestamp: timestamp,
 	}
 	for _, match := range matches {
 		if match[1] == caOngoingScaleUpStatus {
@@ -1390,17 +1428,25 @@ func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
 	return &result, nil
 }
 
-func waitForScaleUpStatus(c clientset.Interface, expected string, timeout time.Duration) (*scaleUpStatus, error) {
-	for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
-		status, err := getScaleUpStatus(c)
-		if err != nil {
-			return nil, err
+func waitForScaleUpStatus(c clientset.Interface, cond func(s *scaleUpStatus) bool, timeout time.Duration) (*scaleUpStatus, error) {
+	var finalErr error
+	var status *scaleUpStatus
+	err := wait.PollImmediate(5*time.Second, timeout, func() (bool, error) {
+		status, finalErr = getScaleUpStatus(c)
+		if finalErr != nil {
+			return false, nil
 		}
-		if status.status == expected {
-			return status, nil
+		if status.timestamp.Add(freshStatusLimit).Before(time.Now()) {
+			// stale status
+			finalErr = fmt.Errorf("Status too old")
+			return false, nil
 		}
+		return cond(status), nil
+	})
+	if err != nil {
+		err = fmt.Errorf("Failed to find expected scale up status: %v, last status: %v, final err: %v", err, status, finalErr)
 	}
-	return nil, fmt.Errorf("ScaleUp status did not reach expected value: %v", expected)
+	return status, err
 }
 
 // This is a temporary fix to allow CA to migrate some kube-system pods