Extends the gpu_device_plugin e2e_node test to verify that scheduled pods
can continue to run even after device plugin deletion and kubelet restarts.
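In short, after the existing kubelet-restart checks, the test now deletes the device plugin pod, waits for the node to stop advertising NVIDIA GPUs, and then asserts that both GPU pods keep their originally assigned device across further container restarts and one more kubelet restart. A condensed sketch of the added sequence for the first pod (identifiers as in the diff below; getDeviceId now also returns the container restart count, and restartKubelet/getDeviceId are helpers defined later in this file):

	// Delete the device plugin pod and wait until the node reports no NVIDIA GPUs.
	By("Deleting device plugin.")
	f.PodClient().Delete(devicePluginPod.Name, &metav1.DeleteOptions{})
	Eventually(func() bool {
		return framework.NumberOfNVIDIAGPUs(getLocalNode(f)) <= 0
	}, 10*time.Minute, framework.Poll).Should(BeTrue())

	// The already-scheduled pod keeps running and keeps its GPU after the plugin is gone...
	count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+1)
	Expect(devIdRestart1).To(Equal(devId1))

	// ...and also after one more kubelet restart.
	restartKubelet(f)
	count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+2)
	Expect(devIdRestart1).To(Equal(devId1))

The full diff follows.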
@@ -18,7 +18,6 @@ package e2e_node
 
 import (
 	"os/exec"
-	"path/filepath"
 	"regexp"
 	"time"
 
@@ -49,6 +48,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 			initialConfig.FeatureGates[string(features.DevicePlugins)] = true
 		})
 
+		var devicePluginPod *v1.Pod
 		BeforeEach(func() {
 			By("Ensuring that Nvidia GPUs exists on the node")
 			if !checkIfNvidiaGPUsExistOnNode() {
@@ -56,7 +56,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 			}
 
 			By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
-			f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
+			devicePluginPod = f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
 
 			By("Waiting for GPUs to become available on the local node")
 			Eventually(func() bool {
@@ -84,7 +84,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 		It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
 			By("Creating one GPU pod on a node with at least two GPUs")
 			p1 := f.PodClient().CreateSync(makeCudaPauseImage())
-			devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
+			count1, devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
 			p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
 			framework.ExpectNoError(err)
 
@@ -92,17 +92,36 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 			restartKubelet(f)
 
 			By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
-			devIdRestart := getDeviceId(f, p1.Name, p1.Name, 2)
-			Expect(devIdRestart).To(Equal(devId1))
+			count1, devIdRestart1 := getDeviceId(f, p1.Name, p1.Name, count1+1)
+			Expect(devIdRestart1).To(Equal(devId1))
 
 			By("Restarting Kubelet and creating another pod")
 			restartKubelet(f)
 			p2 := f.PodClient().CreateSync(makeCudaPauseImage())
 
 			By("Checking that pods got a different GPU")
-			devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
+			count2, devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
 			Expect(devId1).To(Not(Equal(devId2)))
 
+			By("Deleting device plugin.")
+			f.PodClient().Delete(devicePluginPod.Name, &metav1.DeleteOptions{})
+			By("Waiting for GPUs to become unavailable on the local node")
+			Eventually(func() bool {
+				return framework.NumberOfNVIDIAGPUs(getLocalNode(f)) <= 0
+			}, 10*time.Minute, framework.Poll).Should(BeTrue())
+			By("Checking that scheduled pods can continue to run even after we delete device plugin.")
+			count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+1)
+			Expect(devIdRestart1).To(Equal(devId1))
+			count2, devIdRestart2 := getDeviceId(f, p2.Name, p2.Name, count2+1)
+			Expect(devIdRestart2).To(Equal(devId2))
+			By("Restarting Kubelet.")
+			restartKubelet(f)
+			By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
+			count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+2)
+			Expect(devIdRestart1).To(Equal(devId1))
+			count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
+			Expect(devIdRestart2).To(Equal(devId2))
+
 			// Cleanup
 			f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
 			f.PodClient().DeleteSync(p2.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@@ -140,9 +159,6 @@ func newDecimalResourceList(name v1.ResourceName, quantity int64) v1.ResourceLis
 
 // TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
 func restartKubelet(f *framework.Framework) {
-	beforeSocks, err := filepath.Glob("/var/lib/kubelet/device-plugins/nvidiaGPU*.sock")
-	framework.ExpectNoError(err)
-	Expect(len(beforeSocks)).NotTo(BeZero())
 	stdout, err := exec.Command("sudo", "systemctl", "list-units", "kubelet*", "--state=running").CombinedOutput()
 	framework.ExpectNoError(err)
 	regex := regexp.MustCompile("(kubelet-[0-9]+)")
@@ -152,19 +168,18 @@ func restartKubelet(f *framework.Framework) {
 	framework.Logf("Get running kubelet with systemctl: %v, %v", string(stdout), kube)
 	stdout, err = exec.Command("sudo", "systemctl", "restart", kube).CombinedOutput()
 	framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout)
-	Eventually(func() ([]string, error) {
-		return filepath.Glob("/var/lib/kubelet/device-plugins/nvidiaGPU*.sock")
-	}, 5*time.Minute, framework.Poll).ShouldNot(ConsistOf(beforeSocks))
 }
 
-func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) string {
+func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) (int32, string) {
+	var count int32
 	// Wait till pod has been restarted at least restartCount times.
 	Eventually(func() bool {
 		p, err := f.PodClient().Get(podName, metav1.GetOptions{})
 		if err != nil || len(p.Status.ContainerStatuses) < 1 {
 			return false
 		}
-		return p.Status.ContainerStatuses[0].RestartCount >= restartCount
+		count = p.Status.ContainerStatuses[0].RestartCount
+		return count >= restartCount
 	}, 5*time.Minute, framework.Poll).Should(BeTrue())
 	logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName)
 	if err != nil {
@@ -174,7 +189,7 @@ func getDeviceId(f *framework.Framework, podName string, contName string, restar
 	regex := regexp.MustCompile("gpu devices: (nvidia[0-9]+)")
 	matches := regex.FindStringSubmatch(logs)
 	if len(matches) < 2 {
-		return ""
+		return count, ""
 	}
-	return matches[1]
+	return count, matches[1]
 }