Get rid of separate DumbSpreading function and just treat zero-limit pods as having a constant non-zero memory and CPU limit.
David Oppenheimer
2015-07-05 11:39:35 -07:00
parent 44ed229069
commit 4ea8b8a66d
4 changed files with 102 additions and 96 deletions
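
For intuition, here is a minimal, self-contained sketch (editorial; not code from this commit — the helper name, node size, and pod counts are invented) of how crediting zero-limit containers with the new default limits makes LeastRequestedPriority spread such pods by itself, which is what allows the separate DumbSpreadingPriority function to be deleted:

package main

import "fmt"

// Defaults introduced by this commit: a container that sets no limit is treated,
// for priority scoring only, as if it had requested these amounts.
const (
	defaultMilliCPU int64 = 100              // 0.1 core
	defaultMemory   int64 = 60 * 1024 * 1024 // 60 MB
)

// leastRequestedScore mirrors calculateScore in the diff below:
// 10 * (capacity - requested) / capacity, clamped to 0.
func leastRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return ((capacity - requested) * 10) / capacity
}

func main() {
	capacityMilliCPU := int64(2000) // a hypothetical 2-core minion

	// nodeA already runs six zero-limit single-container pods, nodeB runs none;
	// the pod being scheduled is also zero-limit. Before this change both nodes
	// scored 10 (zero-limit pods consumed nothing), so DumbSpreadingPriority was
	// needed to spread pods; with the defaults, the emptier node now wins on its own.
	scoreA := leastRequestedScore((6+1)*defaultMilliCPU, capacityMilliCPU)
	scoreB := leastRequestedScore((0+1)*defaultMilliCPU, capacityMilliCPU)
	fmt.Println(scoreA, scoreB) // prints: 6 9
}

The memory default works the same way; the real code in the diff applies both via toNonzeroLimits.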


@@ -21,6 +21,7 @@ import (
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource"
 	"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
 	"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
 	"github.com/golang/glog"
@@ -28,42 +29,72 @@ import (
 // the unused capacity is calculated on a scale of 0-10
 // 0 being the lowest priority and 10 being the highest
-func calculateScore(requested, capacity int64, node string) int {
+func calculateScore(requested int64, capacity int64, node string) int {
 	if capacity == 0 {
 		return 0
 	}
 	if requested > capacity {
-		glog.Infof("Combined requested resources from existing pods exceeds capacity on minion: %s", node)
+		glog.Infof("Combined requested resources %d from existing pods exceeds capacity %d on minion: %s",
+			requested, capacity, node)
 		return 0
 	}
 	return int(((capacity - requested) * 10) / capacity)
 }
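// Worked example (editorial illustration, not part of the diff): with capacity 2000 millicores
// and 300 millicores already requested, calculateScore returns ((2000-300)*10)/2000 = 8.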
+// For each of these resources, a pod that doesn't request the resource explicitly
+// will be treated as having requested the amount indicated below, for the purpose
+// of computing priority only. This ensures that when scheduling zero-limit pods, such
+// pods will not all be scheduled to the machine with the smallest in-use limit,
+// and that when scheduling regular pods, such pods will not see zero-limit pods as
+// consuming no resources whatsoever.
+const defaultMilliCpuLimit int64 = 100 // 0.1 core
+const defaultMemoryLimit int64 = 60 * 1024 * 1024 // 60 MB
+// TODO: Consider setting default as a fixed fraction of machine capacity (take "capacity api.ResourceList"
+// as an additional argument here) rather than using constants
+func toNonzeroLimits(limits *api.ResourceList) (int64, int64) {
+	var out_millicpu, out_memory int64
+	// Override if un-set, but not if explicitly set to zero
+	if (*limits.Cpu() == resource.Quantity{}) {
+		out_millicpu = defaultMilliCpuLimit
+	} else {
+		out_millicpu = limits.Cpu().MilliValue()
+	}
+	// Override if un-set, but not if explicitly set to zero
+	if (*limits.Memory() == resource.Quantity{}) {
+		out_memory = defaultMemoryLimit
+	} else {
+		out_memory = limits.Memory().Value()
+	}
+	return out_millicpu, out_memory
+}
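// Usage sketch (editorial illustration, not part of the diff): an empty limits list is
// credited the defaults, while explicitly set limits pass through unchanged, e.g.
//
//	cpu, mem := toNonzeroLimits(&api.ResourceList{})                                           // 100, 60*1024*1024
//	cpu, mem = toNonzeroLimits(&api.ResourceList{api.ResourceCPU: resource.MustParse("250m")}) // 250, 60*1024*1024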
 // Calculate the resource occupancy on a node. 'node' has information about the resources on the node.
 // 'pods' is a list of pods currently scheduled on the node.
 func calculateResourceOccupancy(pod *api.Pod, node api.Node, pods []*api.Pod) algorithm.HostPriority {
 	totalMilliCPU := int64(0)
 	totalMemory := int64(0)
+	capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
+	capacityMemory := node.Status.Capacity.Memory().Value()
 	for _, existingPod := range pods {
 		for _, container := range existingPod.Spec.Containers {
-			totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-			totalMemory += container.Resources.Limits.Memory().Value()
+			cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+			totalMilliCPU += cpu
+			totalMemory += memory
 		}
 	}
 	// Add the resources requested by the current pod being scheduled.
 	// This also helps differentiate between differently sized, but empty, minions.
 	for _, container := range pod.Spec.Containers {
-		totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-		totalMemory += container.Resources.Limits.Memory().Value()
+		cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+		totalMilliCPU += cpu
+		totalMemory += memory
 	}
-	capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
-	capacityMemory := node.Status.Capacity.Memory().Value()
 	cpuScore := calculateScore(totalMilliCPU, capacityMilliCPU, node.Name)
 	memoryScore := calculateScore(totalMemory, capacityMemory, node.Name)
-	// glog.V(10).Infof(
-	glog.Infof(
+	glog.V(10).Infof(
 		"%v -> %v: Least Requested Priority, Absolute/Requested: (%d, %d) / (%d, %d) Score: (%d, %d)",
 		pod.Name, node.Name,
 		totalMilliCPU, totalMemory,
@@ -95,47 +126,6 @@ func LeastRequestedPriority(pod *api.Pod, podLister algorithm.PodLister, minionL
 	return list, nil
 }
-func min(l, r int64) (m int64) {
-	m = r
-	if l < r {
-		m = l
-	}
-	return m
-}
-// See comment for DumbSpreadingPriority()
-const dumbSpreadingDenominator int64 = 10
-// DumbSpreadingPriority is a priority function that favors nodes with fewer pods.
-// It works like LeastRequestedPriority but instead of using 10 * percentage of machine free by resource,
-// it uses 10 * percentage of machine free by pod, with "percentage of machine free by pod" calculated as
-// (dumbSpreadingDenominator - number of pods already on the node + 1) / dumbSpreadingDenominator.
-// dumbSpreadingDenominator serves like the machine capacity in LeastRequestedPriority but is chosen
-// so that we equate one pod with a reasonable amount of resources when we combine all the scores together.
-func DumbSpreadingPriority(pod *api.Pod, podLister algorithm.PodLister, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) {
-	nodes, err := minionLister.List()
-	if err != nil {
-		return algorithm.HostPriorityList{}, err
-	}
-	podsToMachines, err := predicates.MapPodsToMachines(podLister)
-	list := algorithm.HostPriorityList{}
-	for _, node := range nodes.Items {
-		npods := int64(len(podsToMachines[node.Name]))
-		score := calculateScore(min(npods+1, dumbSpreadingDenominator), dumbSpreadingDenominator, node.Name)
-		// glog.V(10).Infof(
-		glog.Infof(
-			"%v -> %v: DumbSpreadPriority, Old # pods (%d) Score: (%d)",
-			pod.Name, node.Name, npods, score,
-		)
-		list = append(list, algorithm.HostPriority{
-			Host: node.Name,
-			Score: score,
-		})
-	}
-	return list, nil
-}
 type NodeLabelPrioritizer struct {
 	label    string
 	presence bool
@@ -205,15 +195,17 @@ func calculateBalancedResourceAllocation(pod *api.Pod, node api.Node, pods []*ap
 	score := int(0)
 	for _, existingPod := range pods {
 		for _, container := range existingPod.Spec.Containers {
-			totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-			totalMemory += container.Resources.Limits.Memory().Value()
+			cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+			totalMilliCPU += cpu
+			totalMemory += memory
 		}
 	}
 	// Add the resources requested by the current pod being scheduled.
 	// This also helps differentiate between differently sized, but empty, minions.
 	for _, container := range pod.Spec.Containers {
-		totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-		totalMemory += container.Resources.Limits.Memory().Value()
+		cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+		totalMilliCPU += cpu
+		totalMemory += memory
 	}
 	capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
@@ -232,8 +224,7 @@ func calculateBalancedResourceAllocation(pod *api.Pod, node api.Node, pods []*ap
 		diff := math.Abs(cpuFraction - memoryFraction)
 		score = int(10 - diff*10)
 	}
-	// glog.V(10).Infof(
-	glog.Infof(
+	glog.V(10).Infof(
 		"%v -> %v: Balanced Resource Allocation, Absolute/Requested: (%d, %d) / (%d, %d) Score: (%d)",
 		pod.Name, node.Name,
 		totalMilliCPU, totalMemory,