Merge pull request #28769 from wojtek-t/optimize_priorities
Automatic merge from submit-queue

Optimize priorities in scheduler

Ref #28590

It's probably easier to review it commit by commit, since the changes are largely independent of each other.

@davidopp - FYI
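The central pattern in this change: data that several predicates need for a given pod (here its resource request) is computed once in PredicateMetadata and handed to each predicate, which type-asserts it and falls back to recomputing it when the assertion fails. A minimal self-contained sketch of that shape, using simplified stand-in types rather than the scheduler's real api.Pod and predicateMetadata:

package main

import "fmt"

// resourceRequest stands in for the precomputed per-pod data; the real code
// stores a *resourceRequest inside predicateMetadata.
type resourceRequest struct {
	milliCPU int64
	memory   int64
}

// pod is a simplified stand-in for api.Pod.
type pod struct {
	name     string
	milliCPU int64
	memory   int64
}

// predicateMetadata is computed once per pod per scheduling cycle.
func predicateMetadata(p *pod) *resourceRequest {
	return &resourceRequest{milliCPU: p.milliCPU, memory: p.memory}
}

// podFitsResources accepts the metadata as interface{} and falls back to
// recomputing it when the type assertion fails, mirroring the change below.
func podFitsResources(p *pod, meta interface{}) bool {
	req, ok := meta.(*resourceRequest)
	if !ok {
		// We couldn't parse metadata - fall back to computing it.
		req = predicateMetadata(p)
	}
	return req.milliCPU <= 4000 && req.memory <= 8<<30
}

func main() {
	p := &pod{name: "web", milliCPU: 500, memory: 1 << 30}
	meta := predicateMetadata(p) // computed once
	for i := 0; i < 3; i++ {     // reused across predicate calls
		fmt.Println(podFitsResources(p, meta))
	}
}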
@@ -65,10 +65,10 @@ func (c *CachedNodeInfo) GetNodeInfo(id string) (*api.Node, error) {
 	return node.(*api.Node), nil
 }
 
-// podMetadata defines a type, that is an expected type that is passed
-// as metadata for predicate functions
+// podMetadata is a type that is passed as metadata for predicate functions
 type predicateMetadata struct {
 	podBestEffort bool
+	podRequest    *resourceRequest
 }
 
 func PredicateMetadata(pod *api.Pod) interface{} {
@@ -78,6 +78,7 @@ func PredicateMetadata(pod *api.Pod) interface{} {
 	}
 	return &predicateMetadata{
 		podBestEffort: isPodBestEffort(pod),
+		podRequest:    getResourceRequest(pod),
 	}
 }
 
@@ -405,7 +406,7 @@ type resourceRequest struct {
 	nvidiaGPU int64
 }
 
-func getResourceRequest(pod *api.Pod) resourceRequest {
+func getResourceRequest(pod *api.Pod) *resourceRequest {
 	result := resourceRequest{}
 	for _, container := range pod.Spec.Containers {
 		requests := container.Resources.Requests
@@ -423,7 +424,7 @@ func getResourceRequest(pod *api.Pod) resourceRequest {
 			result.milliCPU = cpu
 		}
 	}
-	return result
+	return &result
 }
 
 func CheckPodsExceedingFreeResources(pods []*api.Pod, allocatable api.ResourceList) (fitting []*api.Pod, notFittingCPU, notFittingMemory, notFittingNvidiaGPU []*api.Pod) {
@@ -471,17 +472,25 @@ func PodFitsResources(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.N
 	if node == nil {
 		return false, fmt.Errorf("node not found")
 	}
-	allocatable := node.Status.Allocatable
-	allowedPodNumber := allocatable.Pods().Value()
-	if int64(len(nodeInfo.Pods()))+1 > allowedPodNumber {
+	allowedPodNumber := nodeInfo.AllowedPodNumber()
+	if len(nodeInfo.Pods())+1 > allowedPodNumber {
 		return false,
-			newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), allowedPodNumber)
+			newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), int64(allowedPodNumber))
 	}
+
+	var podRequest *resourceRequest
+	predicateMeta, ok := meta.(*predicateMetadata)
+	if ok {
+		podRequest = predicateMeta.podRequest
+	} else {
+		// We couldn't parse metadata - fallback to computing it.
+		podRequest = getResourceRequest(pod)
+	}
-	podRequest := getResourceRequest(pod)
 	if podRequest.milliCPU == 0 && podRequest.memory == 0 && podRequest.nvidiaGPU == 0 {
 		return true, nil
 	}
 
+	allocatable := node.Status.Allocatable
 	totalMilliCPU := allocatable.Cpu().MilliValue()
 	totalMemory := allocatable.Memory().Value()
 	totalNvidiaGPU := allocatable.NvidiaGPU().Value()
@@ -498,8 +507,12 @@ func PodFitsResources(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.N
 		return false,
 			newInsufficientResourceError(nvidiaGpuResourceName, podRequest.nvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, totalNvidiaGPU)
 	}
-	glog.V(10).Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
-		podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
+	if glog.V(10) {
+		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+		// not logged. There is visible performance gain from it.
+		glog.Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
+			podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
+	}
 	return true, nil
 }
 
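The comment added above captures the reasoning: glog.V(10).Infof(...) evaluates all of its arguments even when verbosity is below 10, so wrapping the call in if glog.V(10) { ... } skips the argument construction entirely on the hot path. A small sketch of the idiom with the real github.com/golang/glog package; expensiveSummary is a made-up stand-in for costly argument formatting:

package main

import (
	"flag"
	"fmt"

	"github.com/golang/glog"
)

// expensiveSummary is a hypothetical stand-in for argument construction that
// is costly (formatting pods, copying lists, etc.).
func expensiveSummary(n int) string {
	return fmt.Sprintf("running %d pods", n)
}

func main() {
	flag.Parse() // glog registers -v and friends on the default flag set

	// Arguments are always built here, even when -v < 10:
	glog.V(10).Infof("node status: %s", expensiveSummary(42))

	// Guarded form: expensiveSummary only runs when -v >= 10.
	if glog.V(10) {
		glog.Infof("node status: %s", expensiveSummary(42))
	}
	glog.Flush()
}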
@@ -758,8 +771,10 @@ func getUsedPorts(pods ...*api.Pod) map[int]bool {
 	// TODO: Aggregate it at the NodeInfo level.
 	ports := make(map[int]bool)
 	for _, pod := range pods {
-		for _, container := range pod.Spec.Containers {
-			for _, podPort := range container.Ports {
+		for j := range pod.Spec.Containers {
+			container := &pod.Spec.Containers[j]
+			for k := range container.Ports {
+				podPort := &container.Ports[k]
 				// "0" is explicitly ignored in PodFitsHostPorts,
 				// which is the only function that uses this value.
 				if podPort.HostPort != 0 {
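The loop rewrite above is about avoiding value copies: for _, container := range pod.Spec.Containers copies each api.Container (and each api.ContainerPort) on every iteration, while indexing and taking a pointer does not. A minimal sketch of the two forms, with a padded Item type standing in for the large API structs:

package main

import "fmt"

// Item stands in for a large struct such as api.Container; the payload
// field makes each range-copy noticeably expensive.
type Item struct {
	Port    int
	payload [4096]byte
}

func sumByValue(items []Item) int {
	total := 0
	for _, it := range items { // copies the whole Item each iteration
		total += it.Port
	}
	return total
}

func sumByIndex(items []Item) int {
	total := 0
	for i := range items {
		it := &items[i] // no copy, just a pointer into the slice
		total += it.Port
	}
	return total
}

func main() {
	items := make([]Item, 3)
	for i := range items {
		items[i].Port = 8080 + i
	}
	fmt.Println(sumByValue(items), sumByIndex(items))
}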
@@ -999,19 +1014,11 @@ func (checker *PodAffinityChecker) NodeMatchPodAffinityAntiAffinity(pod *api.Pod
 	return true
 }
 
-type TolerationMatch struct {
-	info NodeInfo
-}
-
-func NewTolerationMatchPredicate(info NodeInfo) algorithm.FitPredicate {
-	tolerationMatch := &TolerationMatch{
-		info: info,
-	}
-	return tolerationMatch.PodToleratesNodeTaints
-}
-
-func (t *TolerationMatch) PodToleratesNodeTaints(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
+func PodToleratesNodeTaints(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
 	node := nodeInfo.Node()
 	if node == nil {
 		return false, fmt.Errorf("node not found")
 	}
 
 	taints, err := api.GetTaintsFromNodeAnnotations(node.Annotations)
 	if err != nil {
@@ -1040,7 +1047,8 @@ func tolerationsToleratesTaints(tolerations []api.Toleration, taints []api.Taint
 		return false
 	}
 
-	for _, taint := range taints {
+	for i := range taints {
+		taint := &taints[i]
 		// skip taints that have effect PreferNoSchedule, since it is for priorities
 		if taint.Effect == api.TaintEffectPreferNoSchedule {
 			continue
@@ -2724,10 +2724,9 @@ func TestPodToleratesTaints(t *testing.T) {
 	}
 
 	for _, test := range podTolerateTaintsTests {
-		tolerationMatch := TolerationMatch{FakeNodeInfo(test.node)}
 		nodeInfo := schedulercache.NewNodeInfo()
 		nodeInfo.SetNode(&test.node)
-		fits, err := tolerationMatch.PodToleratesNodeTaints(test.pod, PredicateMetadata(test.pod), nodeInfo)
+		fits, err := PodToleratesNodeTaints(test.pod, PredicateMetadata(test.pod), nodeInfo)
 		if fits == false && !reflect.DeepEqual(err, ErrTaintsTolerationsNotMatch) {
 			t.Errorf("%s, unexpected error: %v", test.test, err)
 		}
@@ -25,31 +25,20 @@ import (
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
-type NodeAffinity struct {
-	nodeLister algorithm.NodeLister
-}
-
-func NewNodeAffinityPriority(nodeLister algorithm.NodeLister) algorithm.PriorityFunction {
-	nodeAffinity := &NodeAffinity{
-		nodeLister: nodeLister,
-	}
-	return nodeAffinity.CalculateNodeAffinityPriority
-}
-
 // CalculateNodeAffinityPriority prioritizes nodes according to node affinity scheduling preferences
 // indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node match a preferredSchedulingTerm,
 // it will a get an add of preferredSchedulingTerm.Weight. Thus, the more preferredSchedulingTerms
 // the node satisfies and the more the preferredSchedulingTerm that is satisfied weights, the higher
 // score the node gets.
-func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-	var maxCount int
-	counts := map[string]int{}
-
+func CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
 	nodes, err := nodeLister.List()
 	if err != nil {
 		return nil, err
 	}
 
+	var maxCount float64
+	counts := make(map[string]float64, len(nodes.Items))
+
 	affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
 	if err != nil {
 		return nil, err
@@ -72,7 +61,7 @@ func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInf
 
 		for _, node := range nodes.Items {
 			if nodeSelector.Matches(labels.Set(node.Labels)) {
-				counts[node.Name] += int(preferredSchedulingTerm.Weight)
+				counts[node.Name] += float64(preferredSchedulingTerm.Weight)
 			}
 
 			if counts[node.Name] > maxCount {
@@ -82,15 +71,20 @@ func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInf
 		}
 	}
 
-	result := []schedulerapi.HostPriority{}
+	result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items))
 	for i := range nodes.Items {
 		node := &nodes.Items[i]
-		fScore := float64(0)
 		if maxCount > 0 {
-			fScore = 10 * (float64(counts[node.Name]) / float64(maxCount))
+			fScore := 10 * (counts[node.Name] / maxCount)
+			result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
+			if glog.V(10) {
+				// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+				// not logged. There is visible performance gain from it.
+				glog.Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
+			}
+		} else {
+			result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: 0})
 		}
-		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
-		glog.V(10).Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
 	}
 	return result, nil
 }
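The scoring loop above normalizes the accumulated preferred-term weights into the scheduler's 0-10 range: fScore = 10 * counts[node]/maxCount, truncated to an int, and 0 for every node when nothing matched (maxCount == 0). A worked sketch of just that normalization, with made-up node names and weights:

package main

import "fmt"

func main() {
	// Sum of matching preferredSchedulingTerm weights per node (hypothetical values).
	counts := map[string]float64{"node-a": 6, "node-b": 3, "node-c": 0}

	var maxCount float64
	for _, c := range counts {
		if c > maxCount {
			maxCount = c
		}
	}

	for name, c := range counts {
		score := 0
		if maxCount > 0 {
			// Same shape as fScore := 10 * (counts[node.Name] / maxCount).
			score = int(10 * (c / maxCount))
		}
		fmt.Printf("%s -> %d\n", name, score) // node-a -> 10, node-b -> 5, node-c -> 0
	}
}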
@@ -156,8 +156,7 @@ func TestNodeAffinityPriority(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		nodeAffinity := NodeAffinity{nodeLister: algorithm.FakeNodeLister(api.NodeList{Items: test.nodes})}
-		list, err := nodeAffinity.CalculateNodeAffinityPriority(test.pod, schedulercache.CreateNodeNameToInfoMap(nil), algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))
+		list, err := CalculateNodeAffinityPriority(test.pod, schedulercache.CreateNodeNameToInfoMap(nil), algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))
 		if err != nil {
 			t.Errorf("unexpected error: %v", err)
 		}
@@ -24,22 +24,10 @@ import (
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
-// NodeTaints hold the node lister
-type TaintToleration struct {
-	nodeLister algorithm.NodeLister
-}
-
-// NewTaintTolerationPriority
-func NewTaintTolerationPriority(nodeLister algorithm.NodeLister) algorithm.PriorityFunction {
-	taintToleration := &TaintToleration{
-		nodeLister: nodeLister,
-	}
-	return taintToleration.ComputeTaintTolerationPriority
-}
-
 // CountIntolerableTaintsPreferNoSchedule gives the count of intolerable taints of a pod with effect PreferNoSchedule
-func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []api.Toleration) (intolerableTaints int) {
-	for _, taint := range taints {
+func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []api.Toleration) (intolerableTaints float64) {
+	for i := range taints {
+		taint := &taints[i]
 		// check only on taints that have effect PreferNoSchedule
 		if taint.Effect != api.TaintEffectPreferNoSchedule {
 			continue
@@ -54,27 +42,27 @@ func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []ap
 
 // getAllTolerationEffectPreferNoSchedule gets the list of all Toleration with Effect PreferNoSchedule
 func getAllTolerationPreferNoSchedule(tolerations []api.Toleration) (tolerationList []api.Toleration) {
-	for _, toleration := range tolerations {
+	for i := range tolerations {
+		toleration := &tolerations[i]
 		if len(toleration.Effect) == 0 || toleration.Effect == api.TaintEffectPreferNoSchedule {
-			tolerationList = append(tolerationList, toleration)
+			tolerationList = append(tolerationList, *toleration)
 		}
 	}
 	return
 }
 
 // ComputeTaintTolerationPriority prepares the priority list for all the nodes based on the number of intolerable taints on the node
-func (s *TaintToleration) ComputeTaintTolerationPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-	// counts hold the count of intolerable taints of a pod for a given node
-	counts := make(map[string]int)
-
-	// the max value of counts
-	var maxCount int
-
+func ComputeTaintTolerationPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
 	nodes, err := nodeLister.List()
 	if err != nil {
 		return nil, err
 	}
 
+	// the max value of counts
+	var maxCount float64
+	// counts hold the count of intolerable taints of a pod for a given node
+	counts := make(map[string]float64, len(nodes.Items))
+
 	tolerations, err := api.GetTolerationsFromPodAnnotations(pod.Annotations)
 	if err != nil {
 		return nil, err
@@ -99,14 +87,19 @@ func (s *TaintToleration) ComputeTaintTolerationPriority(pod *api.Pod, nodeNameT
 
 	// The maximum priority value to give to a node
 	// Priority values range from 0 - maxPriority
-	const maxPriority = 10
+	const maxPriority = float64(10)
+	result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items))
-	for _, node := range nodes.Items {
-		fScore := float64(maxPriority)
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+		fScore := maxPriority
 		if maxCount > 0 {
-			fScore = (1.0 - float64(counts[node.Name])/float64(maxCount)) * 10
+			fScore = (1.0 - counts[node.Name]/maxCount) * 10
 		}
+		if glog.V(10) {
+			// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+			// not logged. There is visible performance gain from it.
+			glog.Infof("%v -> %v: Taint Toleration Priority, Score: (%d)", pod.Name, node.Name, int(fScore))
+		}
-		glog.V(10).Infof("%v -> %v: Taint Toleration Priority, Score: (%d)", pod.Name, node.Name, int(fScore))
 
 		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
 	}
@@ -212,8 +212,7 @@ func TestTaintAndToleration(t *testing.T) {
 	}
 	for _, test := range tests {
 		nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap([]*api.Pod{{}})
-		taintToleration := TaintToleration{nodeLister: algorithm.FakeNodeLister(api.NodeList{Items: test.nodes})}
-		list, err := taintToleration.ComputeTaintTolerationPriority(
+		list, err := ComputeTaintTolerationPriority(
			test.pod,
			nodeNameToInfo,
			algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))