implement inter pod topological affinity and anti-affinity

2016-05-04 06:50:31 +00:00
parent 28a8a23471
commit 82ba4f077e
46 changed files with 8302 additions and 814 deletions
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -19,14 +19,14 @@ package predicates
 import (
 	"fmt"

+	"github.com/golang/glog"
 	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/pkg/client/cache"
 	"k8s.io/kubernetes/pkg/labels"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
+	priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-	"github.com/golang/glog"
-	"k8s.io/kubernetes/pkg/api/unversioned"
 )

 type NodeInfo interface {
@@ -723,3 +723,198 @@ func GeneralPredicates(pod *api.Pod, nodeName string, nodeInfo *schedulercache.N
 	}
 	return true, nil
 }
+
+type PodAffinityChecker struct {
+	info           NodeInfo
+	podLister      algorithm.PodLister
+	failureDomains priorityutil.Topologies
+}
+
+func NewPodAffinityPredicate(info NodeInfo, podLister algorithm.PodLister, failureDomains []string) algorithm.FitPredicate {
+	checker := &PodAffinityChecker{
+		info:           info,
+		podLister:      podLister,
+		failureDomains: priorityutil.Topologies{DefaultKeys: failureDomains},
+	}
+	return checker.InterPodAffinityMatches
+}
+
+func (checker *PodAffinityChecker) InterPodAffinityMatches(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
+	node, err := checker.info.GetNodeInfo(nodeName)
+	if err != nil {
+		return false, err
+	}
+	allPods, err := checker.podLister.List(labels.Everything())
+	if err != nil {
+		return false, err
+	}
+	if checker.NodeMatchPodAffinityAntiAffinity(pod, allPods, node) {
+		return true, nil
+	}
+	return false, ErrPodAffinityNotMatch
+}
+
+// AnyPodMatchesPodAffinityTerm checks if any of given pods can match the specific podAffinityTerm.
+func (checker *PodAffinityChecker) AnyPodMatchesPodAffinityTerm(pod *api.Pod, allPods []*api.Pod, node *api.Node, podAffinityTerm api.PodAffinityTerm) (bool, error) {
+	for _, ep := range allPods {
+		match, err := checker.failureDomains.CheckIfPodMatchPodAffinityTerm(ep, pod, podAffinityTerm,
+			func(ep *api.Pod) (*api.Node, error) { return checker.info.GetNodeInfo(ep.Spec.NodeName) },
+			func(pod *api.Pod) (*api.Node, error) { return node, nil },
+		)
+		if err != nil || match {
+			return match, err
+		}
+	}
+	return false, nil
+}
+
+// Checks whether the given node has pods which satisfy all the required pod affinity scheduling rules.
+// If node has pods which satisfy all the required pod affinity scheduling rules then return true.
+func (checker *PodAffinityChecker) NodeMatchesHardPodAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node, podAffinity *api.PodAffinity) bool {
+	var podAffinityTerms []api.PodAffinityTerm
+	if len(podAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
+		podAffinityTerms = podAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+	}
+	// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
+	//if len(podAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
+	//	podAffinityTerms = append(podAffinityTerms, podAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
+	//}
+
+	for _, podAffinityTerm := range podAffinityTerms {
+		podAffinityTermMatches, err := checker.AnyPodMatchesPodAffinityTerm(pod, allPods, node, podAffinityTerm)
+		if err != nil {
+			glog.V(10).Infof("Cannot schedule pod %+v onto node %v, an error ocurred when checking existing pods on the node for PodAffinityTerm %v err: %v",
+				podName(pod), node.Name, podAffinityTerm, err)
+			return false
+		}
+
+		if !podAffinityTermMatches {
+			// TODO: Think about whether this can be simplified once we have controllerRef
+			// Check if it is in special case that the requiredDuringScheduling affinity requirement can be disregarded.
+			// If the requiredDuringScheduling affinity requirement matches a pod's own labels and namespace, and there are no other such pods
+			// anywhere, then disregard the requirement.
+			// This allows rules like "schedule all of the pods of this collection to the same zone" to not block forever
+			// because the first pod of the collection can't be scheduled.
+			names := priorityutil.GetNamespacesFromPodAffinityTerm(pod, podAffinityTerm)
+			labelSelector, err := unversioned.LabelSelectorAsSelector(podAffinityTerm.LabelSelector)
+			if err != nil || !names.Has(pod.Namespace) || !labelSelector.Matches(labels.Set(pod.Labels)) {
+				glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because none of the existing pods on this node satisfy the PodAffinityTerm %v, err: %+v",
+					podName(pod), node.Name, podAffinityTerm, err)
+				return false
+			}
+
+			// the affinity is to put the pod together with other pods from its same service or controller
+			filteredPods := priorityutil.FilterPodsByNameSpaces(names, allPods)
+			for _, filteredPod := range filteredPods {
+				// if found an existing pod from same service or RC,
+				// the affinity scheduling rules cannot be disregarded.
+				if labelSelector.Matches(labels.Set(filteredPod.Labels)) {
+					glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because none of the existing pods on this node satisfy the PodAffinityTerm %v",
+						podName(pod), node.Name, podAffinityTerm)
+					return false
+				}
+			}
+		}
+	}
+	// all the required pod affinity scheduling rules satisfied
+	glog.V(10).Infof("All the required pod affinity scheduling rules are satisfied for Pod %+v, on node %v", podName(pod), node.Name)
+	return true
+}
+
+// Checks whether the given node has pods which satisfy all the
+// required pod anti-affinity scheduling rules.
+// Also checks whether putting the pod onto the node would break
+// any anti-affinity scheduling rules indicated by existing pods.
+// If node has pods which satisfy all the required pod anti-affinity
+// scheduling rules and scheduling the pod onto the node won't
+// break any existing pods' anti-affinity rules, then return true.
+func (checker *PodAffinityChecker) NodeMatchesHardPodAntiAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node, podAntiAffinity *api.PodAntiAffinity) bool {
+	var podAntiAffinityTerms []api.PodAffinityTerm
+	if len(podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
+		podAntiAffinityTerms = podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+	}
+	// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
+	//if len(podAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
+	//	podAntiAffinityTerms = append(podAntiAffinityTerms, podAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
+	//}
+
+	// foreach element podAntiAffinityTerm of podAntiAffinityTerms
+	// if the pod matches the term (breaks the anti-affinity),
+	// don't schedule the pod onto this node.
+	for _, podAntiAffinityTerm := range podAntiAffinityTerms {
+		podAntiAffinityTermMatches, err := checker.AnyPodMatchesPodAffinityTerm(pod, allPods, node, podAntiAffinityTerm)
+		if err != nil || podAntiAffinityTermMatches == true {
+			glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because not all the existing pods on this node satisfy the PodAntiAffinityTerm %v, err: %v",
+				podName(pod), node.Name, podAntiAffinityTerm, err)
+			return false
+		}
+	}
+
+	// Check if scheduling the pod onto this node would break
+	// any anti-affinity rules indicated by the existing pods on the node.
+	// If it would break, system should not schedule pod onto this node.
+	for _, ep := range allPods {
+		epAffinity, err := api.GetAffinityFromPodAnnotations(ep.Annotations)
+		if err != nil {
+			glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
+			return false
+		}
+		if epAffinity.PodAntiAffinity != nil {
+			var epAntiAffinityTerms []api.PodAffinityTerm
+			if len(epAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
+				epAntiAffinityTerms = epAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+			}
+			// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
+			//if len(epAffinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
+			//	epAntiAffinityTerms = append(epAntiAffinityTerms, epAffinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
+			//}
+
+			for _, epAntiAffinityTerm := range epAntiAffinityTerms {
+				labelSelector, err := unversioned.LabelSelectorAsSelector(epAntiAffinityTerm.LabelSelector)
+				if err != nil {
+					glog.V(10).Infof("Failed to get label selector from anti-affinityterm %+v of existing pod %+v, err: %+v", epAntiAffinityTerm, podName(pod), err)
+					return false
+				}
+
+				names := priorityutil.GetNamespacesFromPodAffinityTerm(ep, epAntiAffinityTerm)
+				if (len(names) == 0 || names.Has(pod.Namespace)) && labelSelector.Matches(labels.Set(pod.Labels)) {
+					epNode, err := checker.info.GetNodeInfo(ep.Spec.NodeName)
+					if err != nil || checker.failureDomains.NodesHaveSameTopologyKey(node, epNode, epAntiAffinityTerm.TopologyKey) {
+						glog.V(10).Infof("Cannot schedule Pod %+v, onto node %v because the pod would break the PodAntiAffinityTerm %+v, of existing pod %+v, err: %v",
+							podName(pod), node.Name, epAntiAffinityTerm, podName(ep), err)
+						return false
+					}
+				}
+			}
+		}
+	}
+	// all the required pod anti-affinity scheduling rules are satisfied
+	glog.V(10).Infof("Can schedule Pod %+v, on node %v because all the required pod anti-affinity scheduling rules are satisfied", podName(pod), node.Name)
+	return true
+}
+
+// NodeMatchPodAffinityAntiAffinity checks if the node matches
+// the requiredDuringScheduling affinity/anti-affinity rules indicated by the pod.
+func (checker *PodAffinityChecker) NodeMatchPodAffinityAntiAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node) bool {
+	// Parse required affinity scheduling rules.
+	affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
+	if err != nil {
+		glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
+		return false
+	}
+
+	// check if the current node match the inter-pod affinity scheduling rules.
+	if affinity.PodAffinity != nil {
+		if !checker.NodeMatchesHardPodAffinity(pod, allPods, node, affinity.PodAffinity) {
+			return false
+		}
+	}
+
+	// check if the current node match the inter-pod anti-affinity scheduling rules.
+	if affinity.PodAntiAffinity != nil {
+		if !checker.NodeMatchesHardPodAntiAffinity(pod, allPods, node, affinity.PodAntiAffinity) {
+			return false
+		}
+	}
+	return true
+}