Merge pull request #63246 from losipiuk/lo/autoscaler-e2e-gpu-tests
Automatic merge from submit-queue (batch tested with PRs 63246, 63185). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Add scale-up test from 0 for GPU node-pool

**Release note**:

```release-note
NONE
```
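The new tests exercise the cluster autoscaler against a GKE GPU node pool: the pool starts with zero (or one) `nvidia-tesla-k80` nodes, a pod that requests a GPU is scheduled, and the test waits for the autoscaler to grow the pool (and, in the scale-down case, to shrink it back once the pod is gone). They are gated behind `[Feature:ClusterSizeAutoscalingGpu]` and skipped on providers other than GKE. Below is a minimal sketch of the kind of pod that forces the scale-up, assuming a pause image as a stand-in for `imageutils.GetPauseImageName()`; the sketch is illustrative and not part of the PR.

```go
package main

import (
	"fmt"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Stand-in for imageutils.GetPauseImageName(); the exact image does not
	// matter for scheduling, only the GPU limit does.
	pauseImage := "k8s.gcr.io/pause:3.1"

	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "gpu-pod"},
		Spec: v1.PodSpec{
			Containers: []v1.Container{{
				Name:  "pause",
				Image: pauseImage,
				Resources: v1.ResourceRequirements{
					// Requesting the nvidia.com/gpu extended resource keeps the pod
					// Pending until a node advertising a GPU exists, which is what
					// forces the cluster autoscaler to grow the GPU pool.
					Limits: v1.ResourceList{
						"nvidia.com/gpu": *resource.NewQuantity(1, resource.DecimalSI),
					},
				},
			}},
		},
	}

	q := pod.Spec.Containers[0].Resources.Limits["nvidia.com/gpu"]
	fmt.Printf("%s requests %s GPU(s)\n", pod.Name, q.String())
}
```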
```diff
@@ -207,6 +207,76 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() { simpleScaleUpTest(0) })
 
+	It("Should scale up GPU pool from 0 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
+		defer disableAutoscaler(gpuPoolName, 0, 1)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
+
+		By("Schedule a pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+	})
+
+	It("Should scale up GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Schedule a single pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
+		defer disableAutoscaler(gpuPoolName, 0, 2)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+
+		framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false)
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
+	})
+
+	It("Should scale down GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Schedule a single pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
+		defer disableAutoscaler(gpuPoolName, 0, 1)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+
+		framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc")
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount }, scaleDownTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
+	})
+
 	It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() {
 			framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleUpTest(1) })
@@ -957,6 +1027,12 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	})
 })
 
+func installNvidiaDriversDaemonSet() {
+	By("Add daemonset which installs nvidia drivers")
+	// the link differs from one in GKE documentation; discussed with @mindprince this one should be used
+	framework.RunKubectlOrDie("apply", "-f", "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml")
+}
+
 func execCmd(args ...string) *exec.Cmd {
 	glog.Infof("Executing: %s", strings.Join(args, " "))
 	return exec.Command(args[0], args[1:]...)
@@ -1300,6 +1376,16 @@ func addNodePool(name string, machineType string, numNodes int) {
 	framework.ExpectNoError(err, string(output))
 }
 
+func addGpuNodePool(name string, gpuType string, gpuCount int, numNodes int) {
+	args := []string{"beta", "container", "node-pools", "create", name, "--quiet",
+		"--accelerator", "type=" + gpuType + ",count=" + strconv.Itoa(gpuCount),
+		"--num-nodes=" + strconv.Itoa(numNodes),
+		"--cluster=" + framework.TestContext.CloudConfig.Cluster}
+	output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
+	glog.Infof("Creating node-pool %s: %s", name, output)
+	framework.ExpectNoError(err, string(output))
+}
+
 func deleteNodePool(name string) {
 	glog.Infof("Deleting node pool %s", name)
 	args := []string{"container", "node-pools", "delete", name, "--quiet",
@@ -1320,7 +1406,7 @@ func deleteNodePool(name string) {
 
 func getPoolNodes(f *framework.Framework, poolName string) []*v1.Node {
 	nodes := make([]*v1.Node, 0, 1)
-	nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
+	nodeList := framework.GetReadyNodesIncludingTaintedOrDie(f.ClientSet)
 	for _, node := range nodeList.Items {
 		if node.Labels[gkeNodepoolNameKey] == poolName {
 			nodes = append(nodes, &node)
@@ -1624,6 +1710,26 @@ func makeNodeSchedulable(c clientset.Interface, node *v1.Node, failOnCriticalAdd
 	return fmt.Errorf("Failed to remove taint from node in allowed number of retries")
 }
 
+func scheduleGpuPod(f *framework.Framework, id string) error {
+	config := &testutils.RCConfig{
+		Client:         f.ClientSet,
+		InternalClient: f.InternalClientset,
+		Name:           id,
+		Namespace:      f.Namespace.Name,
+		Timeout:        3 * scaleUpTimeout, // spinning up GPU node is slow
+		Image:          imageutils.GetPauseImageName(),
+		Replicas:       1,
+		GpuLimit:       1,
+		Labels:         map[string]string{"requires-gpu": "yes"},
+	}
+
+	err := framework.RunRC(*config)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
 // Create an RC running a given number of pods with anti-affinity
 func runAntiAffinityPods(f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string) error {
 	config := &testutils.RCConfig{
```
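For reference, `addGpuNodePool` above amounts to roughly `gcloud beta container node-pools create gpu-pool --quiet --accelerator type=nvidia-tesla-k80,count=1 --num-nodes=0 --cluster=<test cluster>` (with `--num-nodes=1` for the scale-up-from-1 and scale-down cases), and each test defers `deleteNodePool(gpuPoolName)` so the pool is removed even when an assertion fails.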
```diff
@@ -2608,6 +2608,18 @@ func GetReadySchedulableNodesOrDie(c clientset.Interface) (nodes *v1.NodeList) {
 	return nodes
 }
 
+// GetReadyNodesIncludingTaintedOrDie returns all ready nodes, even those which are tainted.
+// There are cases when we care about tainted nodes
+// E.g. in tests related to nodes with gpu we care about nodes despite
+// presence of nvidia.com/gpu=present:NoSchedule taint
+func GetReadyNodesIncludingTaintedOrDie(c clientset.Interface) (nodes *v1.NodeList) {
+	nodes = waitListSchedulableNodesOrDie(c)
+	FilterNodes(nodes, func(node v1.Node) bool {
+		return isNodeSchedulable(&node)
+	})
+	return nodes
+}
+
 func WaitForAllNodesSchedulable(c clientset.Interface, timeout time.Duration) error {
 	Logf("Waiting up to %v for all (but %d) nodes to be schedulable", timeout, TestContext.AllowedNotReadyNodes)
 
```
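The switch from `GetReadySchedulableNodesOrDie` to the new helper matters because freshly created GKE GPU nodes carry the `nvidia.com/gpu=present:NoSchedule` taint until the driver daemonset has prepared them, so a taint-aware schedulability filter would hide exactly the nodes `getPoolNodes` needs to count. A minimal sketch of detecting that taint via the public API (illustrative only; `hasGpuTaint` is not a helper from this PR):

```go
package main

import (
	"fmt"

	"k8s.io/api/core/v1"
)

// hasGpuTaint reports whether a node still carries the NoSchedule taint that
// GKE applies to freshly created GPU nodes (the nvidia.com/gpu=present:NoSchedule
// taint mentioned in the doc comment above).
func hasGpuTaint(node *v1.Node) bool {
	for _, t := range node.Spec.Taints {
		if t.Key == "nvidia.com/gpu" && t.Value == "present" && t.Effect == v1.TaintEffectNoSchedule {
			return true
		}
	}
	return false
}

func main() {
	node := &v1.Node{}
	node.Spec.Taints = []v1.Taint{{Key: "nvidia.com/gpu", Value: "present", Effect: v1.TaintEffectNoSchedule}}
	fmt.Println(hasGpuTaint(node)) // prints: true
}
```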
```diff
@@ -124,6 +124,7 @@ type RCConfig struct {
 	CpuLimit          int64 // millicores
 	MemRequest        int64 // bytes
 	MemLimit          int64 // bytes
+	GpuLimit          int64 // count
 	ReadinessProbe    *v1.Probe
 	DNSPolicy         *v1.DNSPolicy
 	PriorityClassName string
@@ -615,7 +616,7 @@ func (config *RCConfig) applyTo(template *v1.PodTemplateSpec) {
 			c.Ports = append(c.Ports, v1.ContainerPort{Name: k, ContainerPort: int32(v), HostPort: int32(v)})
 		}
 	}
-	if config.CpuLimit > 0 || config.MemLimit > 0 {
+	if config.CpuLimit > 0 || config.MemLimit > 0 || config.GpuLimit > 0 {
 		template.Spec.Containers[0].Resources.Limits = v1.ResourceList{}
 	}
 	if config.CpuLimit > 0 {
@@ -633,6 +634,9 @@ func (config *RCConfig) applyTo(template *v1.PodTemplateSpec) {
 	if config.MemRequest > 0 {
 		template.Spec.Containers[0].Resources.Requests[v1.ResourceMemory] = *resource.NewQuantity(config.MemRequest, resource.DecimalSI)
 	}
+	if config.GpuLimit > 0 {
+		template.Spec.Containers[0].Resources.Limits["nvidia.com/gpu"] = *resource.NewQuantity(config.GpuLimit, resource.DecimalSI)
+	}
 	if len(config.Volumes) > 0 {
 		template.Spec.Volumes = config.Volumes
 	}
```
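A minimal sketch of the limits entry that `applyTo` now produces when `GpuLimit` is set: the GPU is requested through the `nvidia.com/gpu` extended resource as a plain integer count (values below are illustrative):

```go
package main

import (
	"fmt"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Mirrors what RCConfig.applyTo does for a config with GpuLimit = 1:
	// an integer-counted extended resource keyed by nvidia.com/gpu.
	var gpuLimit int64 = 1
	limits := v1.ResourceList{}
	limits["nvidia.com/gpu"] = *resource.NewQuantity(gpuLimit, resource.DecimalSI)

	q := limits["nvidia.com/gpu"]
	fmt.Printf("nvidia.com/gpu limit: %s\n", q.String()) // prints: nvidia.com/gpu limit: 1
}
```

Setting only the limit is enough here: for extended resources the request defaults to the limit, so the scheduler will treat the pod as needing a node that advertises a GPU.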