Add logic to account for pods nominated to run on nodes, but are not running yet.

Add tests for the new logic.
2017-11-07 17:09:21 -08:00
parent 2cbb07a439
commit 8a17ae241d
12 changed files with 853 additions and 388 deletions
--- a/plugin/pkg/scheduler/core/generic_scheduler.go
+++ b/plugin/pkg/scheduler/core/generic_scheduler.go
@@ -81,6 +81,7 @@ func (f *FitError) Error() string {
 type genericScheduler struct {
 	cache                 schedulercache.Cache
 	equivalenceCache      *EquivalenceCache
+	schedulingQueue       SchedulingQueue
 	predicates            map[string]algorithm.FitPredicate
 	priorityMetaProducer  algorithm.MetadataProducer
 	predicateMetaProducer algorithm.PredicateMetadataProducer
@@ -114,7 +115,7 @@ func (g *genericScheduler) Schedule(pod *v1.Pod, nodeLister algorithm.NodeLister
 	}

 	trace.Step("Computing predicates")
-	filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, g.cachedNodeInfoMap, nodes, g.predicates, g.extenders, g.predicateMetaProducer, g.equivalenceCache)
+	filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, g.cachedNodeInfoMap, nodes, g.predicates, g.extenders, g.predicateMetaProducer, g.equivalenceCache, g.schedulingQueue)
 	if err != nil {
 		return "", err
 	}
@@ -177,53 +178,54 @@ func (g *genericScheduler) selectHost(priorityList schedulerapi.HostPriorityList

 // preempt finds nodes with pods that can be preempted to make room for "pod" to
 // schedule. It chooses one of the nodes and preempts the pods on the node and
-// returns the node and the list of preempted pods if such a node is found.
-// TODO(bsalamat): Add priority-based scheduling. More info: today one or more
-// pending pods (different from the pod that triggered the preemption(s)) may
-// schedule into some portion of the resources freed up by the preemption(s)
-// before the pod that triggered the preemption(s) has a chance to schedule
-// there, thereby preventing the pod that triggered the preemption(s) from
-// scheduling. Solution is given at:
-// https://github.com/kubernetes/community/blob/master/contributors/design-proposals/scheduling/pod-preemption.md#preemption-mechanics
-func (g *genericScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister, scheduleErr error) (*v1.Node, []*v1.Pod, error) {
+// returns 1) the node, 2) the list of preempted pods if such a node is found,
+// 3) A list of pods whose nominated node name should be cleared, and 4) any
+// possible error.
+func (g *genericScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister, scheduleErr error) (*v1.Node, []*v1.Pod, []*v1.Pod, error) {
 	// Scheduler may return various types of errors. Consider preemption only if
 	// the error is of type FitError.
 	fitError, ok := scheduleErr.(*FitError)
 	if !ok || fitError == nil {
-		return nil, nil, nil
+		return nil, nil, nil, nil
 	}
 	err := g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap)
 	if err != nil {
-		return nil, nil, err
+		return nil, nil, nil, err
 	}
 	if !podEligibleToPreemptOthers(pod, g.cachedNodeInfoMap) {
 		glog.V(5).Infof("Pod %v is not eligible for more preemption.", pod.Name)
-		return nil, nil, nil
+		return nil, nil, nil, nil
 	}
 	allNodes, err := nodeLister.List()
 	if err != nil {
-		return nil, nil, err
+		return nil, nil, nil, err
 	}
 	if len(allNodes) == 0 {
-		return nil, nil, ErrNoNodesAvailable
+		return nil, nil, nil, ErrNoNodesAvailable
 	}
 	potentialNodes := nodesWherePreemptionMightHelp(pod, allNodes, fitError.FailedPredicates)
 	if len(potentialNodes) == 0 {
 		glog.V(3).Infof("Preemption will not help schedule pod %v on any node.", pod.Name)
-		return nil, nil, nil
+		// In this case, we should clean-up any existing nominated node name of the pod.
+		return nil, nil, []*v1.Pod{pod}, nil
 	}
-	nodeToPods, err := selectNodesForPreemption(pod, g.cachedNodeInfoMap, potentialNodes, g.predicates, g.predicateMetaProducer)
+	nodeToPods, err := selectNodesForPreemption(pod, g.cachedNodeInfoMap, potentialNodes, g.predicates, g.predicateMetaProducer, g.schedulingQueue)
 	if err != nil {
-		return nil, nil, err
+		return nil, nil, nil, err
 	}
 	for len(nodeToPods) > 0 {
 		node := pickOneNodeForPreemption(nodeToPods)
 		if node == nil {
-			return nil, nil, err
+			return nil, nil, nil, err
 		}
 		passes, pErr := nodePassesExtendersForPreemption(pod, node.Name, nodeToPods[node], g.cachedNodeInfoMap, g.extenders)
 		if passes && pErr == nil {
-			return node, nodeToPods[node], err
+			// Lower priority pods nominated to run on this node, may no longer fit on
+			// this node. So, we should remove their nomination. Removing their
+			// nomination updates these pods and moves them to the active queue. It
+			// lets scheduler find another place for them.
+			nominatedPods := g.getLowerPriorityNominatedPods(pod, node.Name)
+			return node, nodeToPods[node], nominatedPods, err
 		}
 		if pErr != nil {
 			glog.Errorf("Error occurred while checking extenders for preemption on node %v: %v", node, pErr)
@@ -231,7 +233,30 @@ func (g *genericScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister,
 		// Remove the node from the map and try to pick a different node.
 		delete(nodeToPods, node)
 	}
-	return nil, nil, err
+	return nil, nil, nil, err
+}
+
+// GetLowerPriorityNominatedPods returns pods whose priority is smaller than the
+// priority of the given "pod" and are nominated to run on the given node.
+// Note: We could possibly check if the nominated lower priority pods still fit
+// and return those that no longer fit, but that would require lots of
+// manipulation of NodeInfo and PredicateMeta per nominated pod. It may not be
+// worth the complexity, especially because we generally expect to have a very
+// small number of nominated pods per node.
+func (g *genericScheduler) getLowerPriorityNominatedPods(pod *v1.Pod, nodeName string) []*v1.Pod {
+	pods := g.schedulingQueue.WaitingPodsForNode(nodeName)
+	if len(pods) == 0 {
+		return nil
+	}
+
+	var lowerPriorityPods []*v1.Pod
+	podPriority := util.GetPodPriority(pod)
+	for _, p := range pods {
+		if util.GetPodPriority(p) < podPriority {
+			lowerPriorityPods = append(lowerPriorityPods, p)
+		}
+	}
+	return lowerPriorityPods
 }

 // Filters the nodes to find the ones that fit based on the given predicate functions
@@ -244,6 +269,7 @@ func findNodesThatFit(
 	extenders []algorithm.SchedulerExtender,
 	metadataProducer algorithm.PredicateMetadataProducer,
 	ecache *EquivalenceCache,
+	schedulingQueue SchedulingQueue,
 ) ([]*v1.Node, FailedPredicateMap, error) {
 	var filtered []*v1.Node
 	failedPredicateMap := FailedPredicateMap{}
@@ -262,7 +288,7 @@ func findNodesThatFit(
 		meta := metadataProducer(pod, nodeNameToInfo)
 		checkNode := func(i int) {
 			nodeName := nodes[i].Name
-			fits, failedPredicates, err := podFitsOnNode(pod, meta, nodeNameToInfo[nodeName], predicateFuncs, ecache)
+			fits, failedPredicates, err := podFitsOnNode(pod, meta, nodeNameToInfo[nodeName], predicateFuncs, ecache, schedulingQueue)
 			if err != nil {
 				predicateResultLock.Lock()
 				errs[err.Error()]++
@@ -306,9 +332,45 @@ func findNodesThatFit(
 	return filtered, failedPredicateMap, nil
 }

+// addNominatedPods adds pods with equal or greater priority which are nominated
+// to run on the node given in nodeInfo to meta and nodeInfo. It returns 1) whether
+// any pod was found, 2) augmented meta data, 3) augmented nodeInfo.
+func addNominatedPods(podPriority int32, meta algorithm.PredicateMetadata,
+	nodeInfo *schedulercache.NodeInfo, queue SchedulingQueue) (bool, algorithm.PredicateMetadata,
+	*schedulercache.NodeInfo) {
+	if queue == nil || nodeInfo == nil || nodeInfo.Node() == nil {
+		// This may happen only in tests.
+		return false, meta, nodeInfo
+	}
+	nominatedPods := queue.WaitingPodsForNode(nodeInfo.Node().Name)
+	if nominatedPods == nil || len(nominatedPods) == 0 {
+		return false, meta, nodeInfo
+	}
+	var metaOut algorithm.PredicateMetadata = nil
+	if meta != nil {
+		metaOut = meta.ShallowCopy()
+	}
+	nodeInfoOut := nodeInfo.Clone()
+	for _, p := range nominatedPods {
+		if util.GetPodPriority(p) >= podPriority {
+			nodeInfoOut.AddPod(p)
+			if metaOut != nil {
+				metaOut.AddPod(p, nodeInfoOut)
+			}
+		}
+	}
+	return true, metaOut, nodeInfoOut
+}
+
 // Checks whether node with a given name and NodeInfo satisfies all predicateFuncs.
-func podFitsOnNode(pod *v1.Pod, meta algorithm.PredicateMetadata, info *schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate,
-	ecache *EquivalenceCache) (bool, []algorithm.PredicateFailureReason, error) {
+func podFitsOnNode(
+	pod *v1.Pod,
+	meta algorithm.PredicateMetadata,
+	info *schedulercache.NodeInfo,
+	predicateFuncs map[string]algorithm.FitPredicate,
+	ecache *EquivalenceCache,
+	queue SchedulingQueue,
+) (bool, []algorithm.PredicateFailureReason, error) {
 	var (
 		equivalenceHash  uint64
 		failedPredicates []algorithm.PredicateFailureReason
@@ -318,34 +380,85 @@ func podFitsOnNode(pod *v1.Pod, meta algorithm.PredicateMetadata, info *schedule
 		reasons          []algorithm.PredicateFailureReason
 		err              error
 	)
+	predicateResults := make(map[string]HostPredicate)
+
 	if ecache != nil {
 		// getHashEquivalencePod will return immediately if no equivalence pod found
 		equivalenceHash, eCacheAvailable = ecache.getHashEquivalencePod(pod)
 	}
-	for predicateKey, predicate := range predicateFuncs {
-		// If equivalenceCache is available
-		if eCacheAvailable {
-			// PredicateWithECache will returns it's cached predicate results
-			fit, reasons, invalid = ecache.PredicateWithECache(pod.GetName(), info.Node().GetName(), predicateKey, equivalenceHash)
+	podsAdded := false
+	// We run predicates twice in some cases. If the node has greater or equal priority
+	// nominated pods, we run them when those pods are added to meta and nodeInfo.
+	// If all predicates succeed in this pass, we run them again when these
+	// nominated pods are not added. This second pass is necessary because some
+	// predicates such as inter-pod affinity may not pass without the nominated pods.
+	// If there are no nominated pods for the node or if the first run of the
+	// predicates fail, we don't run the second pass.
+	// We consider only equal or higher priority pods in the first pass, because
+	// those are the current "pod" must yield to them and not take a space opened
+	// for running them. It is ok if the current "pod" take resources freed for
+	// lower priority pods.
+	// Requiring that the new pod is schedulable in both circumstances ensures that
+	// we are making a conservative decision: predicates like resources and inter-pod
+	// anti-affinity are more likely to fail when the nominated pods are treated
+	// as running, while predicates like pod affinity are more likely to fail when
+	// the nominated pods are treated as not running. We can't just assume the
+	// nominated pods are running because they are not running right now and in fact,
+	// they may end up getting scheduled to a different node.
+	for i := 0; i < 2; i++ {
+		metaToUse := meta
+		nodeInfoToUse := info
+		if i == 0 {
+			podsAdded, metaToUse, nodeInfoToUse = addNominatedPods(util.GetPodPriority(pod), meta, info, queue)
+		} else if !podsAdded || len(failedPredicates) != 0 {
+			break
 		}
-
-		if !eCacheAvailable || invalid {
-			// we need to execute predicate functions since equivalence cache does not work
-			fit, reasons, err = predicate(pod, meta, info)
-			if err != nil {
-				return false, []algorithm.PredicateFailureReason{}, err
-			}
-
+		// Bypass eCache if node has any nominated pods.
+		// TODO(bsalamat): consider using eCache and adding proper eCache invalidations
+		// when pods are nominated or their nominations change.
+		eCacheAvailable = eCacheAvailable && !podsAdded
+		for predicateKey, predicate := range predicateFuncs {
 			if eCacheAvailable {
-				// update equivalence cache with newly computed fit & reasons
-				// TODO(resouer) should we do this in another thread? any race?
-				ecache.UpdateCachedPredicateItem(pod.GetName(), info.Node().GetName(), predicateKey, fit, reasons, equivalenceHash)
+				// PredicateWithECache will return its cached predicate results.
+				fit, reasons, invalid = ecache.PredicateWithECache(pod.GetName(), info.Node().GetName(), predicateKey, equivalenceHash)
+			}
+
+			// TODO(bsalamat): When one predicate fails and fit is false, why do we continue
+			// checking other predicates?
+			if !eCacheAvailable || invalid {
+				// we need to execute predicate functions since equivalence cache does not work
+				fit, reasons, err = predicate(pod, metaToUse, nodeInfoToUse)
+				if err != nil {
+					return false, []algorithm.PredicateFailureReason{}, err
+				}
+				if eCacheAvailable {
+					// Store data to update eCache after this loop.
+					if res, exists := predicateResults[predicateKey]; exists {
+						res.Fit = res.Fit && fit
+						res.FailReasons = append(res.FailReasons, reasons...)
+						predicateResults[predicateKey] = res
+					} else {
+						predicateResults[predicateKey] = HostPredicate{Fit: fit, FailReasons: reasons}
+					}
+				}
+			}
+			if !fit {
+				// eCache is available and valid, and predicates result is unfit, record the fail reasons
+				failedPredicates = append(failedPredicates, reasons...)
 			}
 		}
+	}

-		if !fit {
-			// eCache is available and valid, and predicates result is unfit, record the fail reasons
-			failedPredicates = append(failedPredicates, reasons...)
+	// TODO(bsalamat): This way of updating equiv. cache has a race condition against
+	// cache invalidations invoked in event handlers. This race has existed despite locks
+	// in eCache implementation. If cache is invalidated after a predicate is executed
+	// and before we update the cache, the updates should not be written to the cache.
+	if eCacheAvailable {
+		nodeName := info.Node().GetName()
+		for predKey, result := range predicateResults {
+			// update equivalence cache with newly computed fit & reasons
+			// TODO(resouer) should we do this in another thread? any race?
+			ecache.UpdateCachedPredicateItem(pod.GetName(), nodeName, predKey, result.Fit, result.FailReasons, equivalenceHash)
 		}
 	}
 	return len(failedPredicates) == 0, failedPredicates, nil
@@ -597,6 +710,7 @@ func selectNodesForPreemption(pod *v1.Pod,
 	potentialNodes []*v1.Node,
 	predicates map[string]algorithm.FitPredicate,
 	metadataProducer algorithm.PredicateMetadataProducer,
+	queue SchedulingQueue,
 ) (map[*v1.Node][]*v1.Pod, error) {

 	nodeNameToPods := map[*v1.Node][]*v1.Pod{}
@@ -610,7 +724,7 @@ func selectNodesForPreemption(pod *v1.Pod,
 		if meta != nil {
 			metaCopy = meta.ShallowCopy()
 		}
-		pods, fits := selectVictimsOnNode(pod, metaCopy, nodeNameToInfo[nodeName], predicates)
+		pods, fits := selectVictimsOnNode(pod, metaCopy, nodeNameToInfo[nodeName], predicates, queue)
 		if fits {
 			resultLock.Lock()
 			nodeNameToPods[potentialNodes[i]] = pods
@@ -672,7 +786,9 @@ func selectVictimsOnNode(
 	pod *v1.Pod,
 	meta algorithm.PredicateMetadata,
 	nodeInfo *schedulercache.NodeInfo,
-	fitPredicates map[string]algorithm.FitPredicate) ([]*v1.Pod, bool) {
+	fitPredicates map[string]algorithm.FitPredicate,
+	queue SchedulingQueue,
+) ([]*v1.Pod, bool) {
 	potentialVictims := util.SortableList{CompFunc: util.HigherPriorityPod}
 	nodeInfoCopy := nodeInfo.Clone()

@@ -703,7 +819,7 @@ func selectVictimsOnNode(
 	// that we should check is if the "pod" is failing to schedule due to pod affinity
 	// failure.
 	// TODO(bsalamat): Consider checking affinity to lower priority pods if feasible with reasonable performance.
-	if fits, _, err := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits {
+	if fits, _, err := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil, queue); !fits {
 		if err != nil {
 			glog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err)
 		}
@@ -714,7 +830,7 @@ func selectVictimsOnNode(
 	for _, p := range potentialVictims.Items {
 		lpp := p.(*v1.Pod)
 		addPod(lpp)
-		if fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil); !fits {
+		if fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil, queue); !fits {
 			removePod(lpp)
 			victims = append(victims, lpp)
 			glog.V(5).Infof("Pod %v is a potential preemption victim on node %v.", lpp.Name, nodeInfo.Node().Name)
@@ -764,7 +880,6 @@ func nodesWherePreemptionMightHelp(pod *v1.Pod, nodes []*v1.Node, failedPredicat
 // considered for preemption.
 // We look at the node that is nominated for this pod and as long as there are
 // terminating pods on the node, we don't consider this for preempting more pods.
-// TODO(bsalamat): Revisit this algorithm once scheduling by priority is added.
 func podEligibleToPreemptOthers(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) bool {
 	if nodeName, found := pod.Annotations[NominatedNodeAnnotationKey]; found {
 		if nodeInfo, found := nodeNameToInfo[nodeName]; found {
@@ -782,6 +897,7 @@ func podEligibleToPreemptOthers(pod *v1.Pod, nodeNameToInfo map[string]*schedule
 func NewGenericScheduler(
 	cache schedulercache.Cache,
 	eCache *EquivalenceCache,
+	podQueue SchedulingQueue,
 	predicates map[string]algorithm.FitPredicate,
 	predicateMetaProducer algorithm.PredicateMetadataProducer,
 	prioritizers []algorithm.PriorityConfig,
@@ -790,6 +906,7 @@ func NewGenericScheduler(
 	return &genericScheduler{
 		cache:                 cache,
 		equivalenceCache:      eCache,
+		schedulingQueue:       podQueue,
 		predicates:            predicates,
 		predicateMetaProducer: predicateMetaProducer,
 		prioritizers:          prioritizers,