From e878c20ac7726bf71f99893564442c83122c1df1 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 6 Oct 2021 17:41:45 +0200 Subject: [PATCH 1/2] e2e: node: improve error logging In the `restartKubelet` helper, we use `exec.Command`, whose return value is the output of the command, but as `[]byte`. The way we logged the output of the command was as value, making the output, meant to be human readable, unnecessarily hard to read. We fix this annoying behaviour by converting the output to string before logging it out, making it pretty obvious to understand the outcome of the command. Signed-off-by: Francesco Romani --- test/e2e_node/util.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index fde40d03c75..5a3599ee7fc 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -418,10 +418,10 @@ func restartKubelet() { kubeletServiceName := findKubletServiceName(false) // reset the kubelet service start-limit-hit stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout)) stdout, err = exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout)) } // stopKubelet will kill the running kubelet, and returns a func that will restart the process again @@ -430,14 +430,14 @@ func stopKubelet() func() { // reset the kubelet service start-limit-hit stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to reset kubelet 
start-limit-hit with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout)) stdout, err = exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %s", err, string(stdout)) return func() { stdout, err := exec.Command("sudo", "systemctl", "start", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout)) } } From d15bff283924055c7687d19f2906dea05f5482b5 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 6 Oct 2021 17:45:22 +0200 Subject: [PATCH 2/2] e2e: node: expose the `running` flag Each e2e test knows it wants to restart a running kubelet or a non-running kubelet. The vast majority of times, we want to restart a running kubelet (e.g. to change config or to check some properties hold across kubelet crashes/restarts), but sometimes we stop the kubelet, do some actions and only then restart. To accommodate both use cases, we just expose the `running` boolean flag to the e2e tests. Having the `restartKubelet` explicitly restarting a running kubelet helps us to troubleshoot e2e failures in which the kubelet was supposed to be running, while it was not; attempting a restart in such cases only muddied the waters further, making the troubleshooting and the eventual fix harder. In the happy path, no expected change in behaviour. 
Signed-off-by: Francesco Romani --- test/e2e_node/device_plugin_test.go | 2 +- test/e2e_node/gpu_device_plugin_test.go | 8 ++++---- test/e2e_node/hugepages_test.go | 2 +- test/e2e_node/memory_manager_test.go | 2 +- test/e2e_node/podresources_test.go | 2 +- test/e2e_node/util.go | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index 9e4493f3a21..c579c7a5ed8 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -220,7 +220,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { restartTime := time.Now() ginkgo.By("Restarting Kubelet") - restartKubelet() + restartKubelet(true) // We need to wait for node to be ready before re-registering stub device plugin. // Otherwise, Kubelet DeviceManager may remove the re-registered sockets after it starts. diff --git a/test/e2e_node/gpu_device_plugin_test.go b/test/e2e_node/gpu_device_plugin_test.go index 20b410b3326..54f787846da 100644 --- a/test/e2e_node/gpu_device_plugin_test.go +++ b/test/e2e_node/gpu_device_plugin_test.go @@ -103,7 +103,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute) } - restartKubelet() + restartKubelet(true) ginkgo.By("Waiting for GPUs to become unavailable on the local node") gomega.Eventually(func() bool { @@ -142,7 +142,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat framework.ExpectEqual(devIDRestart1, devID1) ginkgo.By("Restarting Kubelet") - restartKubelet() + restartKubelet(true) framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute) ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.") @@ -172,7 +172,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat } ginkgo.By("Restarting Kubelet") - 
restartKubelet() + restartKubelet(true) ginkgo.By("Confirming that after a kubelet and pod restart, GPU assignment is kept") ensurePodContainerRestart(f, p1.Name, p1.Name) @@ -181,7 +181,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat ginkgo.By("Restarting Kubelet and creating another pod") - restartKubelet() + restartKubelet(true) framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute) ensurePodContainerRestart(f, p1.Name, p1.Name) diff --git a/test/e2e_node/hugepages_test.go b/test/e2e_node/hugepages_test.go index e62376d8c38..74c715333cc 100644 --- a/test/e2e_node/hugepages_test.go +++ b/test/e2e_node/hugepages_test.go @@ -207,7 +207,7 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H framework.ExpectEqual(value.String(), "9Mi", "huge pages with size 3Mi should be supported") ginkgo.By("restarting the node and verifying that huge pages with size 3Mi are not supported") - restartKubelet() + restartKubelet(true) ginkgo.By("verifying that the hugepages-3Mi resource no longer is present") gomega.Eventually(func() bool { diff --git a/test/e2e_node/memory_manager_test.go b/test/e2e_node/memory_manager_test.go index eee9674de48..6bdfd3a5340 100644 --- a/test/e2e_node/memory_manager_test.go +++ b/test/e2e_node/memory_manager_test.go @@ -348,7 +348,7 @@ var _ = SIGDescribe("Memory Manager [Serial] [Feature:MemoryManager]", func() { return kubeletHealthCheck(kubeletHealthCheckURL) }, time.Minute, time.Second).Should(gomega.BeFalse()) - restartKubelet() + restartKubelet(false) // wait until the kubelet health check will pass gomega.Eventually(func() bool { diff --git a/test/e2e_node/podresources_test.go b/test/e2e_node/podresources_test.go index 591060c8af2..3a010e0f0c0 100644 --- a/test/e2e_node/podresources_test.go +++ b/test/e2e_node/podresources_test.go @@ -731,7 +731,7 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P expectPodResources(1, cli, 
[]podDesc{desc}) ginkgo.By("Restarting Kubelet") - restartKubelet() + restartKubelet(true) framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout) expectPodResources(1, cli, []podDesc{desc}) tpd.deletePodsForTest(f) diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index 5a3599ee7fc..411b8e02529 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -414,8 +414,8 @@ func findKubletServiceName(running bool) string { return kubeletServiceName } -func restartKubelet() { - kubeletServiceName := findKubletServiceName(false) +func restartKubelet(running bool) { + kubeletServiceName := findKubletServiceName(running) // reset the kubelet service start-limit-hit stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput() framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))