Refactor the reboot test to print accurate information about node failures,
as well as events from the kube-system namespace, on test failure.
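
The heart of the first change is swapping the buffered result channel for a sync.WaitGroup plus a per-node result slice, so the test can name exactly which nodes failed once every goroutine has finished. Below is a minimal, standalone sketch of that fan-out/fan-in pattern; checkNode and the node names are illustrative stand-ins, not code from the test.

package main

import (
	"fmt"
	"sync"
)

// checkNode is a stand-in for per-node work such as rebootNode; it reports
// success or failure for one node. Purely illustrative.
func checkNode(name string) bool {
	return name != "node-2" // pretend one node fails
}

func main() {
	nodes := []string{"node-0", "node-1", "node-2"}

	// One result slot per node, so the final report can name exactly which
	// nodes failed instead of draining an unordered channel.
	result := make([]bool, len(nodes))
	var wg sync.WaitGroup
	wg.Add(len(nodes))

	for ix := range nodes {
		go func(ix int) {
			defer wg.Done()
			result[ix] = checkNode(nodes[ix])
		}(ix)
	}

	// Wait for every goroutine, then report per-node outcomes.
	wg.Wait()
	for ix, ok := range result {
		if !ok {
			fmt.Printf("Node %s failed the check.\n", nodes[ix])
		}
	}
}

Because each goroutine writes only its own slot, the final loop can attribute failures to specific nodes, which the single shared channel could not do without extra bookkeeping.
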
@@ -18,6 +18,7 @@ package e2e
 
 import (
 	"fmt"
+	"sync"
 	"time"
 
 	"k8s.io/kubernetes/pkg/api"
@@ -26,6 +27,7 @@ import (
 	"k8s.io/kubernetes/pkg/labels"
 
 	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
 )
 
 const (
@@ -43,16 +45,33 @@ const (
 )
 
 var _ = Describe("Reboot", func() {
-	f := NewFramework("reboot")
+	var f *Framework
 
 	BeforeEach(func() {
 		// These tests requires SSH to nodes, so the provider check should be identical to there
 		// (the limiting factor is the implementation of util.go's getSigner(...)).
 
 		// Cluster must support node reboot
-		SkipUnlessProviderIs("gce", "gke", "aws")
+		SkipUnlessProviderIs(providersWithSSH...)
 	})
 
+	AfterEach(func() {
+		if CurrentGinkgoTestDescription().Failed {
+			// Most of the reboot tests just make sure that addon/system pods are running, so dump
+			// events for the kube-system namespace on failures
+			namespaceName := api.NamespaceSystem
+			By(fmt.Sprintf("Collecting events from namespace %q.", namespaceName))
+			events, err := f.Client.Events(namespaceName).List(labels.Everything(), fields.Everything())
+			Expect(err).NotTo(HaveOccurred())
+
+			for _, e := range events.Items {
+				Logf("event for %v: %v %v: %v", e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
+			}
+		}
+	})
+
+	f = NewFramework("reboot")
+
 	It("each node by ordering clean reboot and ensure they function upon restart", func() {
 		// clean shutdown and restart
 		// We sleep 10 seconds to give some time for ssh command to cleanly finish before the node is rebooted.
@@ -100,22 +119,32 @@ func testReboot(c *client.Client, rebootCmd string) {
 	if err != nil {
 		Failf("Error getting nodes: %v", err)
 	}
-	result := make(chan bool, len(nodelist.Items))
-	for _, n := range nodelist.Items {
-		go rebootNode(c, testContext.Provider, n.ObjectMeta.Name, rebootCmd, result)
+	result := make([]bool, len(nodelist.Items))
+	wg := sync.WaitGroup{}
+	wg.Add(len(nodelist.Items))
+
+	failed := false
+	for ix := range nodelist.Items {
+		go func(ix int) {
+			defer wg.Done()
+			n := nodelist.Items[ix]
+			result[ix] = rebootNode(c, testContext.Provider, n.ObjectMeta.Name, rebootCmd)
+			if !result[ix] {
+				failed = true
+			}
+		}(ix)
 	}
 
 	// Wait for all to finish and check the final result.
-	failed := false
-	// TODO(a-robinson): Change to `for range` syntax and remove logging once
-	// we support only Go >= 1.4.
-	for _, n := range nodelist.Items {
-		if !<-result {
-			Failf("Node %s failed reboot test.", n.ObjectMeta.Name)
-			failed = true
-		}
-	}
+	wg.Wait()
 	if failed {
+		for ix := range nodelist.Items {
+			n := nodelist.Items[ix]
+			if !result[ix] {
+				Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
+			}
+		}
 		Failf("Test failed; at least one node failed to reboot in the time given.")
 	}
 }
@@ -149,7 +178,7 @@ func issueSSHCommand(node *api.Node, provider, cmd string) error {
 //
 // It returns true through result only if all of the steps pass; at the first
 // failed step, it will return false through result and not run the rest.
-func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
+func rebootNode(c *client.Client, provider, name, rebootCmd string) bool {
 	// Setup
 	ns := api.NamespaceSystem
 	ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
@@ -160,14 +189,12 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
 	node, err := c.Nodes().Get(name)
 	if err != nil {
 		Logf("Couldn't get node %s", name)
-		result <- false
-		return
+		return false
 	}
 
 	// Node sanity check: ensure it is "ready".
 	if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) {
-		result <- false
-		return
+		return false
 	}
 
 	// Get all the pods on the node that don't have liveness probe set.
@@ -191,36 +218,31 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
 	// For each pod, we do a sanity check to ensure it's running / healthy
 	// now, as that's what we'll be checking later.
 	if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) {
-		result <- false
-		return
+		return false
 	}
 
 	// Reboot the node.
 	if err = issueSSHCommand(node, provider, rebootCmd); err != nil {
 		Logf("Error while issuing ssh command: %v", err)
-		result <- false
-		return
+		return false
 	}
 
 	// Wait for some kind of "not ready" status.
 	if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
-		result <- false
-		return
+		return false
	}
 
 	// Wait for some kind of "ready" status.
 	if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
-		result <- false
-		return
+		return false
 	}
 
 	// Ensure all of the pods that we found on this node before the reboot are
 	// running / healthy.
 	if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) {
-		result <- false
-		return
+		return false
 	}
 
 	Logf("Reboot successful on node %s", name)
-	result <- true
+	return true
 }
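
The second change registers a Ginkgo AfterEach that dumps kube-system events whenever a spec fails, so post-mortem information is available even when the failure happened in an addon pod rather than in the test itself. Below is a compact, self-contained sketch of that on-failure hook, assuming the same vintage client API used in the diff; the client import path is not shown above and may differ in your tree, and fmt.Printf stands in for the framework's Logf helper.

package e2e

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api"
	client "k8s.io/kubernetes/pkg/client"
	"k8s.io/kubernetes/pkg/fields"
	"k8s.io/kubernetes/pkg/labels"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

var _ = Describe("EventDumpExample", func() {
	var c *client.Client // assumed to be initialized by the suite's setup

	AfterEach(func() {
		// Only spend time listing events when the spec actually failed.
		if !CurrentGinkgoTestDescription().Failed {
			return
		}
		events, err := c.Events(api.NamespaceSystem).List(labels.Everything(), fields.Everything())
		Expect(err).NotTo(HaveOccurred())
		for _, e := range events.Items {
			fmt.Printf("event for %v: %v %v: %v\n", e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
		}
	})

	It("does something against the cluster", func() {
		// ... test body elided ...
	})
})
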