implement inter pod topological affinity and anti-affinity
@@ -19,14 +19,14 @@ package predicates
 import (
     "fmt"
 
+    "github.com/golang/glog"
     "k8s.io/kubernetes/pkg/api"
+    "k8s.io/kubernetes/pkg/api/unversioned"
     "k8s.io/kubernetes/pkg/client/cache"
     "k8s.io/kubernetes/pkg/labels"
     "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
+    priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
     "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
-
-    "github.com/golang/glog"
-    "k8s.io/kubernetes/pkg/api/unversioned"
 )
 
 type NodeInfo interface {
@@ -723,3 +723,198 @@ func GeneralPredicates(pod *api.Pod, nodeName string, nodeInfo *schedulercache.N
     }
     return true, nil
 }
+
+type PodAffinityChecker struct {
+    info           NodeInfo
+    podLister      algorithm.PodLister
+    failureDomains priorityutil.Topologies
+}
+
+func NewPodAffinityPredicate(info NodeInfo, podLister algorithm.PodLister, failureDomains []string) algorithm.FitPredicate {
+    checker := &PodAffinityChecker{
+        info:           info,
+        podLister:      podLister,
+        failureDomains: priorityutil.Topologies{DefaultKeys: failureDomains},
+    }
+    return checker.InterPodAffinityMatches
+}
+
+func (checker *PodAffinityChecker) InterPodAffinityMatches(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
+    node, err := checker.info.GetNodeInfo(nodeName)
+    if err != nil {
+        return false, err
+    }
+    allPods, err := checker.podLister.List(labels.Everything())
+    if err != nil {
+        return false, err
+    }
+    if checker.NodeMatchPodAffinityAntiAffinity(pod, allPods, node) {
+        return true, nil
+    }
+    return false, ErrPodAffinityNotMatch
+}
+
+// AnyPodMatchesPodAffinityTerm checks whether any of the given pods matches the specified podAffinityTerm.
+func (checker *PodAffinityChecker) AnyPodMatchesPodAffinityTerm(pod *api.Pod, allPods []*api.Pod, node *api.Node, podAffinityTerm api.PodAffinityTerm) (bool, error) {
+    for _, ep := range allPods {
+        match, err := checker.failureDomains.CheckIfPodMatchPodAffinityTerm(ep, pod, podAffinityTerm,
+            func(ep *api.Pod) (*api.Node, error) { return checker.info.GetNodeInfo(ep.Spec.NodeName) },
+            func(pod *api.Pod) (*api.Node, error) { return node, nil },
+        )
+        if err != nil || match {
+            return match, err
+        }
+    }
+    return false, nil
+}
+
+// NodeMatchesHardPodAffinity checks whether the given node already has pods which satisfy all of the pod's
+// required (hard) pod affinity scheduling rules. It returns true only if every required affinity term is satisfied.
+func (checker *PodAffinityChecker) NodeMatchesHardPodAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node, podAffinity *api.PodAffinity) bool {
+    var podAffinityTerms []api.PodAffinityTerm
+    if len(podAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
+        podAffinityTerms = podAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+    }
+    // TODO: Uncomment this block when implementing RequiredDuringSchedulingRequiredDuringExecution.
+    //if len(podAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
+    //	podAffinityTerms = append(podAffinityTerms, podAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
+    //}
+
+    for _, podAffinityTerm := range podAffinityTerms {
+        podAffinityTermMatches, err := checker.AnyPodMatchesPodAffinityTerm(pod, allPods, node, podAffinityTerm)
+        if err != nil {
+            glog.V(10).Infof("Cannot schedule pod %+v onto node %v, an error occurred when checking existing pods on the node for PodAffinityTerm %v, err: %v",
+                podName(pod), node.Name, podAffinityTerm, err)
+            return false
+        }
+
+        if !podAffinityTermMatches {
+            // TODO: Think about whether this can be simplified once we have controllerRef.
+            // Check whether this is the special case where the requiredDuringScheduling affinity requirement can be disregarded.
+            // If the requiredDuringScheduling affinity requirement matches the pod's own labels and namespace, and there are no other such pods
+            // anywhere, then disregard the requirement.
+            // This allows rules like "schedule all of the pods of this collection to the same zone" to not block forever
+            // because the first pod of the collection can't be scheduled.
+            names := priorityutil.GetNamespacesFromPodAffinityTerm(pod, podAffinityTerm)
+            labelSelector, err := unversioned.LabelSelectorAsSelector(podAffinityTerm.LabelSelector)
+            if err != nil || !names.Has(pod.Namespace) || !labelSelector.Matches(labels.Set(pod.Labels)) {
+                glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because none of the existing pods on this node satisfy the PodAffinityTerm %v, err: %+v",
+                    podName(pod), node.Name, podAffinityTerm, err)
+                return false
+            }
+
+            // The affinity term is meant to co-locate the pod with other pods from the same service or controller.
+            filteredPods := priorityutil.FilterPodsByNameSpaces(names, allPods)
+            for _, filteredPod := range filteredPods {
+                // If an existing pod from the same service or RC is found anywhere,
+                // the affinity scheduling rules cannot be disregarded.
+                if labelSelector.Matches(labels.Set(filteredPod.Labels)) {
+                    glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because none of the existing pods on this node satisfy the PodAffinityTerm %v",
+                        podName(pod), node.Name, podAffinityTerm)
+                    return false
+                }
+            }
+        }
+    }
+    // All the required pod affinity scheduling rules are satisfied.
+    glog.V(10).Infof("All the required pod affinity scheduling rules are satisfied for Pod %+v on node %v", podName(pod), node.Name)
+    return true
+}
+
+// NodeMatchesHardPodAntiAffinity checks whether scheduling the pod onto the given node
+// would violate any of the pod's required (hard) anti-affinity scheduling rules.
+// It also checks whether putting the pod onto the node would break any required
+// anti-affinity scheduling rules indicated by the existing pods on the node.
+// It returns true only if scheduling the pod onto the node violates neither the
+// pod's own anti-affinity rules nor the existing pods' anti-affinity rules.
+func (checker *PodAffinityChecker) NodeMatchesHardPodAntiAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node, podAntiAffinity *api.PodAntiAffinity) bool {
+    var podAntiAffinityTerms []api.PodAffinityTerm
+    if len(podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
+        podAntiAffinityTerms = podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+    }
+    // TODO: Uncomment this block when implementing RequiredDuringSchedulingRequiredDuringExecution.
+    //if len(podAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
+    //	podAntiAffinityTerms = append(podAntiAffinityTerms, podAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
+    //}
+
+    // For each podAntiAffinityTerm of podAntiAffinityTerms:
+    // if any existing pod matches the term (which would break the anti-affinity),
+    // don't schedule the pod onto this node.
+    for _, podAntiAffinityTerm := range podAntiAffinityTerms {
+        podAntiAffinityTermMatches, err := checker.AnyPodMatchesPodAffinityTerm(pod, allPods, node, podAntiAffinityTerm)
+        if err != nil || podAntiAffinityTermMatches {
+            glog.V(10).Infof("Cannot schedule pod %+v onto node %v, because an existing pod matches the PodAntiAffinityTerm %v, err: %v",
+                podName(pod), node.Name, podAntiAffinityTerm, err)
+            return false
+        }
+    }
+
+    // Check whether scheduling the pod onto this node would break
+    // any anti-affinity rules indicated by the existing pods on the node.
+    // If it would, the pod must not be scheduled onto this node.
+    for _, ep := range allPods {
+        epAffinity, err := api.GetAffinityFromPodAnnotations(ep.Annotations)
+        if err != nil {
+            glog.V(10).Infof("Failed to get Affinity from existing Pod %+v, err: %+v", podName(ep), err)
+            return false
+        }
+        if epAffinity.PodAntiAffinity != nil {
+            var epAntiAffinityTerms []api.PodAffinityTerm
+            if len(epAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
+                epAntiAffinityTerms = epAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+            }
+            // TODO: Uncomment this block when implementing RequiredDuringSchedulingRequiredDuringExecution.
+            //if len(epAffinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
+            //	epAntiAffinityTerms = append(epAntiAffinityTerms, epAffinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
+            //}
+
+            for _, epAntiAffinityTerm := range epAntiAffinityTerms {
+                labelSelector, err := unversioned.LabelSelectorAsSelector(epAntiAffinityTerm.LabelSelector)
+                if err != nil {
+                    glog.V(10).Infof("Failed to get label selector from the anti-affinity term %+v of existing pod %+v, err: %+v", epAntiAffinityTerm, podName(ep), err)
+                    return false
+                }
+
+                names := priorityutil.GetNamespacesFromPodAffinityTerm(ep, epAntiAffinityTerm)
+                if (len(names) == 0 || names.Has(pod.Namespace)) && labelSelector.Matches(labels.Set(pod.Labels)) {
+                    epNode, err := checker.info.GetNodeInfo(ep.Spec.NodeName)
+                    if err != nil || checker.failureDomains.NodesHaveSameTopologyKey(node, epNode, epAntiAffinityTerm.TopologyKey) {
+                        glog.V(10).Infof("Cannot schedule Pod %+v onto node %v because it would break the PodAntiAffinityTerm %+v of existing pod %+v, err: %v",
+                            podName(pod), node.Name, epAntiAffinityTerm, podName(ep), err)
+                        return false
+                    }
+                }
+            }
+        }
+    }
+    // All the required pod anti-affinity scheduling rules are satisfied.
+    glog.V(10).Infof("Can schedule Pod %+v on node %v because all the required pod anti-affinity scheduling rules are satisfied", podName(pod), node.Name)
+    return true
+}
+
+// NodeMatchPodAffinityAntiAffinity checks whether the node matches
+// the requiredDuringScheduling affinity/anti-affinity rules indicated by the pod.
+func (checker *PodAffinityChecker) NodeMatchPodAffinityAntiAffinity(pod *api.Pod, allPods []*api.Pod, node *api.Node) bool {
+    // Parse the pod's required affinity scheduling rules.
+    affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
+    if err != nil {
+        glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
+        return false
+    }
+
+    // Check whether the current node matches the inter-pod affinity scheduling rules.
+    if affinity.PodAffinity != nil {
+        if !checker.NodeMatchesHardPodAffinity(pod, allPods, node, affinity.PodAffinity) {
+            return false
+        }
+    }
+
+    // Check whether the current node matches the inter-pod anti-affinity scheduling rules.
+    if affinity.PodAntiAffinity != nil {
+        if !checker.NodeMatchesHardPodAntiAffinity(pod, allPods, node, affinity.PodAntiAffinity) {
+            return false
+        }
+    }
+    return true
+}
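
Below is a minimal usage sketch, not part of the commit, showing how the new predicate might be constructed and invoked in isolation. The fakeNodeInfo and fakePodLister types are hypothetical test doubles whose method signatures are inferred from the calls made in the diff above (the predicates package's NodeInfo.GetNodeInfo and algorithm.PodLister.List), and the failureDomains slice is an assumed set of default topology keys to fall back on when an affinity term omits TopologyKey.

package main

import (
    "fmt"

    "k8s.io/kubernetes/pkg/api"
    "k8s.io/kubernetes/pkg/labels"
    "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
)

// fakeNodeInfo is a hypothetical stand-in satisfying the predicates.NodeInfo interface
// used by PodAffinityChecker (it only needs GetNodeInfo, per the diff above).
type fakeNodeInfo struct {
    nodes map[string]*api.Node
}

func (f fakeNodeInfo) GetNodeInfo(nodeName string) (*api.Node, error) {
    node, ok := f.nodes[nodeName]
    if !ok {
        return nil, fmt.Errorf("node %q not found", nodeName)
    }
    return node, nil
}

// fakePodLister is a hypothetical stand-in satisfying algorithm.PodLister by
// filtering a fixed slice of pods against the given selector.
type fakePodLister []*api.Pod

func (f fakePodLister) List(s labels.Selector) ([]*api.Pod, error) {
    var selected []*api.Pod
    for _, pod := range f {
        if s.Matches(labels.Set(pod.Labels)) {
            selected = append(selected, pod)
        }
    }
    return selected, nil
}

func main() {
    nodes := map[string]*api.Node{
        "node-1": {ObjectMeta: api.ObjectMeta{
            Name:   "node-1",
            Labels: map[string]string{"kubernetes.io/hostname": "node-1"},
        }},
    }

    // Assumed default topology keys used when a term has no TopologyKey; these feed
    // priorityutil.Topologies{DefaultKeys: ...} inside NewPodAffinityPredicate.
    failureDomains := []string{"kubernetes.io/hostname", "failure-domain.beta.kubernetes.io/zone"}

    fit := predicates.NewPodAffinityPredicate(fakeNodeInfo{nodes: nodes}, fakePodLister{}, failureDomains)

    // A pod with no affinity annotation is expected to fit trivially; a pod carrying
    // affinity/anti-affinity annotations would be checked against the lister's pods.
    // The *schedulercache.NodeInfo argument is unused by this predicate (see
    // InterPodAffinityMatches above), so nil is passed here.
    pod := &api.Pod{ObjectMeta: api.ObjectMeta{Name: "demo", Namespace: "default"}}
    fits, err := fit(pod, "node-1", nil)
    fmt.Println(fits, err)
}

The sketch only exercises the happy path; in the scheduler itself the predicate is returned as an algorithm.FitPredicate and evaluated per node like any other fit predicate.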