From e878c20ac7726bf71f99893564442c83122c1df1 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 6 Oct 2021 17:41:45 +0200 Subject: [PATCH 1/2] e2e: node: improve error logging In the `restartKubelet` helper, we use `exec.Command`, whose return value is the output of the command, but as `[]byte`. The way we logged the output of the command was as value, making the output, meant to be human readable, unnecessarily hard to read. We fix this annoying behaviour by converting the output to string before logging it out, making it pretty obvious to understand the outcome of the command. Signed-off-by: Francesco Romani --- test/e2e_node/util.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index fde40d03c75..5a3599ee7fc 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -418,10 +418,10 @@ func restartKubelet() { kubeletServiceName := findKubletServiceName(false) // reset the kubelet service start-limit-hit stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout)) stdout, err = exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout)) } // stopKubelet will kill the running kubelet, and returns a func that will restart the process again @@ -430,14 +430,14 @@ func stopKubelet() func() { // reset the kubelet service start-limit-hit stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to reset kubelet 
start-limit-hit with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout)) stdout, err = exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %s", err, string(stdout)) return func() { stdout, err := exec.Command("sudo", "systemctl", "start", kubeletServiceName).CombinedOutput() - framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout) + framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout)) } } From d15bff283924055c7687d19f2906dea05f5482b5 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 6 Oct 2021 17:45:22 +0200 Subject: [PATCH 2/2] e2e: node: expose the `running` flag Each e2e test knows it wants to restart a running kubelet or a non-running kubelet. The vast majority of times, we want to restart a running kubelet (e.g. to change config or to check some properties hold across kubelet crashes/restarts), but sometimes we stop the kubelet, do some actions and only then restart. To accommodate both use cases, we just expose the `running` boolean flag to the e2e tests. Having the `restartKubelet` explicitly restarting a running kubelet helps us to troubleshoot e2e failures in which the kubelet was supposed to be running, while it was not; attempting a restart in such cases only muddied the waters further, making the troubleshooting and the eventual fix harder. In the happy path, no expected change in behaviour. 
Signed-off-by: Francesco Romani --- test/e2e_node/device_plugin_test.go | 2 +- test/e2e_node/gpu_device_plugin_test.go | 8 ++++---- test/e2e_node/hugepages_test.go | 2 +- test/e2e_node/memory_manager_test.go | 2 +- test/e2e_node/podresources_test.go | 2 +- test/e2e_node/util.go | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index 9e4493f3a21..c579c7a5ed8 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -220,7 +220,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { restartTime := time.Now() ginkgo.By("Restarting Kubelet") - restartKubelet() + restartKubelet(true) // We need to wait for node to be ready before re-registering stub device plugin. // Otherwise, Kubelet DeviceManager may remove the re-registered sockets after it starts. diff --git a/test/e2e_node/gpu_device_plugin_test.go b/test/e2e_node/gpu_device_plugin_test.go index 20b410b3326..54f787846da 100644 --- a/test/e2e_node/gpu_device_plugin_test.go +++ b/test/e2e_node/gpu_device_plugin_test.go @@ -103,7 +103,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute) } - restartKubelet() + restartKubelet(true) ginkgo.By("Waiting for GPUs to become unavailable on the local node") gomega.Eventually(func() bool { @@ -142,7 +142,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat framework.ExpectEqual(devIDRestart1, devID1) ginkgo.By("Restarting Kubelet") - restartKubelet() + restartKubelet(true) framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute) ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.") @@ -172,7 +172,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat } ginkgo.By("Restarting Kubelet") - 
restartKubelet() + restartKubelet(true) ginkgo.By("Confirming that after a kubelet and pod restart, GPU assignment is kept") ensurePodContainerRestart(f, p1.Name, p1.Name) @@ -181,7 +181,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat ginkgo.By("Restarting Kubelet and creating another pod") - restartKubelet() + restartKubelet(true) framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute) ensurePodContainerRestart(f, p1.Name, p1.Name) diff --git a/test/e2e_node/hugepages_test.go b/test/e2e_node/hugepages_test.go index e62376d8c38..74c715333cc 100644 --- a/test/e2e_node/hugepages_test.go +++ b/test/e2e_node/hugepages_test.go @@ -207,7 +207,7 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H framework.ExpectEqual(value.String(), "9Mi", "huge pages with size 3Mi should be supported") ginkgo.By("restarting the node and verifying that huge pages with size 3Mi are not supported") - restartKubelet() + restartKubelet(true) ginkgo.By("verifying that the hugepages-3Mi resource no longer is present") gomega.Eventually(func() bool { diff --git a/test/e2e_node/memory_manager_test.go b/test/e2e_node/memory_manager_test.go index eee9674de48..6bdfd3a5340 100644 --- a/test/e2e_node/memory_manager_test.go +++ b/test/e2e_node/memory_manager_test.go @@ -348,7 +348,7 @@ var _ = SIGDescribe("Memory Manager [Serial] [Feature:MemoryManager]", func() { return kubeletHealthCheck(kubeletHealthCheckURL) }, time.Minute, time.Second).Should(gomega.BeFalse()) - restartKubelet() + restartKubelet(false) // wait until the kubelet health check will pass gomega.Eventually(func() bool { diff --git a/test/e2e_node/podresources_test.go b/test/e2e_node/podresources_test.go index 591060c8af2..3a010e0f0c0 100644 --- a/test/e2e_node/podresources_test.go +++ b/test/e2e_node/podresources_test.go @@ -731,7 +731,7 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P expectPodResources(1, cli, 
[]podDesc{desc}) ginkgo.By("Restarting Kubelet") - restartKubelet() + restartKubelet(true) framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout) expectPodResources(1, cli, []podDesc{desc}) tpd.deletePodsForTest(f) diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index 5a3599ee7fc..411b8e02529 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -414,8 +414,8 @@ func findKubletServiceName(running bool) string { return kubeletServiceName } -func restartKubelet() { - kubeletServiceName := findKubletServiceName(false) +func restartKubelet(running bool) { + kubeletServiceName := findKubletServiceName(running) // reset the kubelet service start-limit-hit stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput() framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))