Merge pull request #8246 from rrati/red-flag-pod-termination
Correct logic for failing after % of containers fail. #7790
commit ab0844840a
@@ -69,6 +69,11 @@ type TestContextType struct {
 
 var testContext TestContextType
 
+type ContainerFailures struct {
+	status   *api.ContainerStateTerminated
+	restarts int
+}
+
 func Logf(format string, a ...interface{}) {
 	fmt.Fprintf(GinkgoWriter, "INFO: "+format+"\n", a...)
 }
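The new ContainerFailures type pairs a container's termination record with its restart count; the reworked FailedContainers (in the last hunk below) returns one such record per container ID. A minimal, self-contained sketch of the shape this produces, assuming a local terminated struct as a stand-in for api.ContainerStateTerminated (the real type also carries reason, message, and timestamps):

package main

import "fmt"

// terminated is a hypothetical stand-in for api.ContainerStateTerminated.
type terminated struct{ ExitCode int }

// containerFailures mirrors the ContainerFailures struct added above.
type containerFailures struct {
	status   *terminated
	restarts int
}

func main() {
	// One record per container ID, the shape FailedContainers returns.
	states := map[string]containerFailures{
		"docker://abc123": {status: &terminated{ExitCode: 137}, restarts: 2},
		"docker://def456": {restarts: 1}, // restarted, no termination recorded
	}
	for id, f := range states {
		fmt.Printf("%s: restarts=%d, terminated=%v\n", id, f.restarts, f.status != nil)
	}
}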
@@ -555,6 +560,7 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 	pending := 0
 	unknown := 0
 	inactive := 0
+	failedContainers := 0
 	time.Sleep(10 * time.Second)
 
 	// TODO: Use a reflector both to put less strain on the cluster and
@@ -566,8 +572,8 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 		for _, p := range currentPods.Items {
 			if p.Status.Phase == api.PodRunning {
 				current++
-				if err := VerifyContainersAreNotFailed(p, maxContainerFailures); err != nil {
-					return err
+				for _, v := range FailedContainers(p) {
+					failedContainers = failedContainers + v.restarts
 				}
 			} else if p.Status.Phase == api.PodPending {
 				if p.Spec.Host == "" {
@@ -618,6 +624,10 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 		}
 		last = current
 		oldPods = currentPods
+
+		if failedContainers > maxContainerFailures {
+			return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
+		}
 	}
 	if current != replicas {
 		return fmt.Errorf("Only %d pods started out of %d", current, replicas)
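Taken together, the three RunRC hunks above change the failure model: instead of aborting on the first container that ever terminated or restarted, the loop sums restart counts across all running pods and only errors once the total exceeds maxContainerFailures. A self-contained sketch of that accumulate-then-threshold flow, with a hypothetical pod type standing in for api.Pod and its status:

package main

import "fmt"

// pod is a hypothetical stand-in for api.Pod; restarts plays the role of
// the per-container restart counts that FailedContainers(p) reports.
type pod struct {
	running  bool
	restarts map[string]int
}

// checkFailures mirrors the new RunRC flow: sum restarts over all running
// pods, then fail only once the total exceeds the allowed threshold.
func checkFailures(pods []pod, maxContainerFailures int) error {
	failedContainers := 0
	for _, p := range pods {
		if !p.running {
			continue
		}
		for _, v := range p.restarts {
			failedContainers = failedContainers + v
		}
	}
	if failedContainers > maxContainerFailures {
		return fmt.Errorf("%d containers failed which is more than allowed %d",
			failedContainers, maxContainerFailures)
	}
	return nil
}

func main() {
	pods := []pod{
		{running: true, restarts: map[string]int{"c1": 1}},
		{running: true, restarts: map[string]int{"c2": 2}},
	}
	// Three total restarts against a threshold of 2: the run fails, but a
	// single flaky container alone would no longer abort it.
	fmt.Println(checkFailures(pods, 2))
}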
@@ -686,35 +696,36 @@ func listPods(c *client.Client, namespace string, label labels.Selector, field f
 	return pods, err
 }
 
-//VerifyContainersAreNotFailed confirms that containers didn't enter an invalid state.
-//For example, too many restarts, or non nill Termination, and so on.
-func VerifyContainersAreNotFailed(pod api.Pod, restartMax int) error {
-	var errStrings []string
+// FailedContainers inspects all containers in a pod and returns failure
+// information for containers that have failed or been restarted.
+// A map is returned where the key is the containerID and the value is a
+// struct containing the restart and failure information
+func FailedContainers(pod api.Pod) map[string]ContainerFailures {
+	var state ContainerFailures
+	states := make(map[string]ContainerFailures)
 
 	statuses := pod.Status.ContainerStatuses
 	if len(statuses) == 0 {
 		return nil
 	} else {
 		for _, status := range statuses {
-			var errormsg string = ""
 			if status.State.Termination != nil {
-				errormsg = "status.State.Termination was nil"
+				states[status.ContainerID] = ContainerFailures{status: status.State.Termination}
 			} else if status.LastTerminationState.Termination != nil {
-				errormsg = "status.LastTerminationState.Termination was nil"
-			} else if status.RestartCount > restartMax {
-				errormsg = fmt.Sprintf("restarted %d times", restartMax)
+				states[status.ContainerID] = ContainerFailures{status: status.LastTerminationState.Termination}
 			}
-
-			if len(errormsg) != 0 {
-				errStrings = append(errStrings, fmt.Sprintf("Error: Pod %s (host: %s) : Container w/ name %s status was bad (%v).", pod.Name, pod.Spec.Host, status.Name, errormsg))
+			if status.RestartCount > 0 {
+				var ok bool
+				if state, ok = states[status.ContainerID]; !ok {
+					state = ContainerFailures{}
+				}
+				state.restarts = status.RestartCount
+				states[status.ContainerID] = state
 			}
 		}
 	}
 
-	if len(errStrings) > 0 {
-		return fmt.Errorf(strings.Join(errStrings, "\n"))
-	}
-	return nil
+	return states
 }
 
 // Prints the histogram of the events and returns the number of bad events.
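For readers without the surrounding file, the following is a self-contained sketch of the new function, using local stand-in types since the real api package types are not reproduced here. One liberty taken: the commit's explicit var ok lookup is written as a zero-value map read, which is behaviorally identical in Go:

package main

import "fmt"

// Hypothetical stand-ins for the api types; only the fields the new
// function reads are modeled, so the sketch compiles on its own.
type containerStateTerminated struct{ ExitCode int }

type containerState struct {
	Termination *containerStateTerminated
}

type containerStatus struct {
	ContainerID          string
	RestartCount         int
	State                containerState
	LastTerminationState containerState
}

type containerFailures struct {
	status   *containerStateTerminated
	restarts int
}

// failedContainers follows the committed logic: record a current or previous
// termination per container ID, then fold in any nonzero restart count.
func failedContainers(statuses []containerStatus) map[string]containerFailures {
	if len(statuses) == 0 {
		return nil
	}
	states := make(map[string]containerFailures)
	for _, status := range statuses {
		if status.State.Termination != nil {
			states[status.ContainerID] = containerFailures{status: status.State.Termination}
		} else if status.LastTerminationState.Termination != nil {
			states[status.ContainerID] = containerFailures{status: status.LastTerminationState.Termination}
		}
		if status.RestartCount > 0 {
			state := states[status.ContainerID] // zero value if not yet present
			state.restarts = status.RestartCount
			states[status.ContainerID] = state
		}
	}
	return states
}

func main() {
	out := failedContainers([]containerStatus{
		{ContainerID: "docker://abc", RestartCount: 3,
			LastTerminationState: containerState{&containerStateTerminated{ExitCode: 1}}},
		{ContainerID: "docker://def"}, // healthy container: not reported
	})
	for id, f := range out {
		fmt.Printf("%s: restarts=%d, terminated=%v\n", id, f.restarts, f.status != nil)
	}
}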