Merge pull request #50949 from bsalamat/preemption_eviction
Automatic merge from submit-queue
Add pod preemption to the scheduler
**What this PR does / why we need it**:
This is the last of a series of PRs to add priority-based preemption to the scheduler. This PR connects the preemption logic to the scheduler workflow.
**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes #48646
**Special notes for your reviewer**:
This PR includes other PRs which are under review (#50805, #50405, #50190). All the new code is located in 43627afdf9.
**Release note**:
```release-note
Add priority-based preemption to the scheduler.
```
ref/ #47604
/assign @davidopp
@kubernetes/sig-scheduling-pr-reviews
This commit is contained in:
@@ -45,6 +45,10 @@ import (
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
|
||||
const (
|
||||
MatchInterPodAffinity = "MatchInterPodAffinity"
|
||||
)
|
||||
|
||||
// NodeInfo: Other types for predicate functions...
|
||||
type NodeInfo interface {
|
||||
GetNodeInfo(nodeID string) (*v1.Node, error)
|
||||
@@ -152,7 +156,7 @@ func isVolumeConflict(volume v1.Volume, pod *v1.Pod) bool {
|
||||
// - Ceph RBD forbids if any two pods share at least same monitor, and match pool and image.
|
||||
// - ISCSI forbids if any two pods share at least same IQN, LUN and Target
|
||||
// TODO: migrate this into some per-volume specific code?
|
||||
func NoDiskConflict(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func NoDiskConflict(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
for _, v := range pod.Spec.Volumes {
|
||||
for _, ev := range nodeInfo.Pods() {
|
||||
if isVolumeConflict(v, ev) {
|
||||
@@ -250,7 +254,7 @@ func (c *MaxPDVolumeCountChecker) filterVolumes(volumes []v1.Volume, namespace s
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *MaxPDVolumeCountChecker) predicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func (c *MaxPDVolumeCountChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
// If a pod doesn't have any volume attached to it, the predicate will always be true.
|
||||
// Thus we make a fast path for it, to avoid unnecessary computations in this case.
|
||||
if len(pod.Spec.Volumes) == 0 {
|
||||
@@ -371,7 +375,7 @@ func NewVolumeZonePredicate(pvInfo PersistentVolumeInfo, pvcInfo PersistentVolum
|
||||
return c.predicate
|
||||
}
|
||||
|
||||
func (c *VolumeZoneChecker) predicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func (c *VolumeZoneChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
// If a pod doesn't have any volume attached to it, the predicate will always be true.
|
||||
// Thus we make a fast path for it, to avoid unnecessary computations in this case.
|
||||
if len(pod.Spec.Volumes) == 0 {
|
||||
@@ -529,7 +533,7 @@ func podName(pod *v1.Pod) string {
|
||||
// PodFitsResources checks if a node has sufficient resources, such as cpu, memory, gpu, opaque int resources etc to run a pod.
|
||||
// First return value indicates whether a node has sufficient resources to run a pod while the second return value indicates the
|
||||
// predicate failure reasons if the node has insufficient resources to run the pod.
|
||||
func PodFitsResources(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func PodFitsResources(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
return false, nil, fmt.Errorf("node not found")
|
||||
@@ -658,7 +662,7 @@ func podMatchesNodeLabels(pod *v1.Pod, node *v1.Node) bool {
|
||||
}
|
||||
|
||||
// PodMatchNodeSelector checks if a pod node selector matches the node label.
|
||||
func PodMatchNodeSelector(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func PodMatchNodeSelector(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
return false, nil, fmt.Errorf("node not found")
|
||||
@@ -670,7 +674,7 @@ func PodMatchNodeSelector(pod *v1.Pod, meta interface{}, nodeInfo *schedulercach
|
||||
}
|
||||
|
||||
// PodFitsHost checks if a pod spec node name matches the current node.
|
||||
func PodFitsHost(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func PodFitsHost(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
if len(pod.Spec.NodeName) == 0 {
|
||||
return true, nil, nil
|
||||
}
|
||||
@@ -709,7 +713,7 @@ func NewNodeLabelPredicate(labels []string, presence bool) algorithm.FitPredicat
|
||||
// Alternately, eliminating nodes that have a certain label, regardless of value, is also useful
|
||||
// A node may have a label with "retiring" as key and the date as the value
|
||||
// and it may be desirable to avoid scheduling new pods on this node
|
||||
func (n *NodeLabelChecker) CheckNodeLabelPresence(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func (n *NodeLabelChecker) CheckNodeLabelPresence(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
return false, nil, fmt.Errorf("node not found")
|
||||
@@ -792,7 +796,7 @@ func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister al
|
||||
//
|
||||
// WARNING: This Predicate is NOT guaranteed to work if some of the predicateMetadata data isn't precomputed...
|
||||
// For that reason it is not exported, i.e. it is highly coupled to the implementation of the FitPredicate construction.
|
||||
func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
var services []*v1.Service
|
||||
var pods []*v1.Pod
|
||||
if pm, ok := meta.(*predicateMetadata); ok && (pm.serviceAffinityMatchingPodList != nil || pm.serviceAffinityMatchingPodServices != nil) {
|
||||
@@ -804,6 +808,7 @@ func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta interface{}, no
|
||||
s.serviceAffinityMetadataProducer(pm)
|
||||
pods, services = pm.serviceAffinityMatchingPodList, pm.serviceAffinityMatchingPodServices
|
||||
}
|
||||
filteredPods := nodeInfo.FilterOutPods(pods)
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
return false, nil, fmt.Errorf("node not found")
|
||||
@@ -813,8 +818,8 @@ func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta interface{}, no
|
||||
// Step 1: If we don't have all constraints, introspect nodes to find the missing constraints.
|
||||
if len(s.labels) > len(affinityLabels) {
|
||||
if len(services) > 0 {
|
||||
if len(pods) > 0 {
|
||||
nodeWithAffinityLabels, err := s.nodeInfo.GetNodeInfo(pods[0].Spec.NodeName)
|
||||
if len(filteredPods) > 0 {
|
||||
nodeWithAffinityLabels, err := s.nodeInfo.GetNodeInfo(filteredPods[0].Spec.NodeName)
|
||||
if err != nil {
|
||||
return false, nil, err
|
||||
}
|
||||
@@ -830,7 +835,7 @@ func (s *ServiceAffinity) checkServiceAffinity(pod *v1.Pod, meta interface{}, no
|
||||
}
|
||||
|
||||
// PodFitsHostPorts checks if a node has free ports for the requested pod ports.
|
||||
func PodFitsHostPorts(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func PodFitsHostPorts(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
var wantPorts map[int]bool
|
||||
if predicateMeta, ok := meta.(*predicateMetadata); ok {
|
||||
wantPorts = predicateMeta.podPorts
|
||||
@@ -871,7 +876,7 @@ func haveSame(a1, a2 []string) bool {
|
||||
|
||||
// GeneralPredicates checks whether noncriticalPredicates and EssentialPredicates pass. noncriticalPredicates are the predicates
|
||||
// that only non-critical pods need and EssentialPredicates are the predicates that all pods, including critical pods, need
|
||||
func GeneralPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func GeneralPredicates(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
var predicateFails []algorithm.PredicateFailureReason
|
||||
fit, reasons, err := noncriticalPredicates(pod, meta, nodeInfo)
|
||||
if err != nil {
|
||||
@@ -893,7 +898,7 @@ func GeneralPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.N
|
||||
}
|
||||
|
||||
// noncriticalPredicates are the predicates that only non-critical pods need
|
||||
func noncriticalPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func noncriticalPredicates(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
var predicateFails []algorithm.PredicateFailureReason
|
||||
fit, reasons, err := PodFitsResources(pod, meta, nodeInfo)
|
||||
if err != nil {
|
||||
@@ -907,7 +912,7 @@ func noncriticalPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercac
|
||||
}
|
||||
|
||||
// EssentialPredicates are the predicates that all pods, including critical pods, need
|
||||
func EssentialPredicates(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func EssentialPredicates(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
var predicateFails []algorithm.PredicateFailureReason
|
||||
fit, reasons, err := PodFitsHost(pod, meta, nodeInfo)
|
||||
if err != nil {
|
||||
@@ -953,7 +958,7 @@ func NewPodAffinityPredicate(info NodeInfo, podLister algorithm.PodLister) algor
|
||||
// InterPodAffinityMatches checks if a pod can be scheduled on the specified node with pod affinity/anti-affinity configuration.
|
||||
// First return value indicates whether a pod can be scheduled on the specified node while the second return value indicates the
|
||||
// predicate failure reasons if the pod cannot be scheduled on the specified node.
|
||||
func (c *PodAffinityChecker) InterPodAffinityMatches(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func (c *PodAffinityChecker) InterPodAffinityMatches(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
return false, nil, fmt.Errorf("node not found")
|
||||
@@ -1138,7 +1143,7 @@ func (c *PodAffinityChecker) getMatchingAntiAffinityTerms(pod *v1.Pod, allPods [
|
||||
|
||||
// Checks if scheduling the pod onto this node would break any anti-affinity
|
||||
// rules indicated by the existing pods.
|
||||
func (c *PodAffinityChecker) satisfiesExistingPodsAntiAffinity(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) bool {
|
||||
func (c *PodAffinityChecker) satisfiesExistingPodsAntiAffinity(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) bool {
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
return false
|
||||
@@ -1246,7 +1251,7 @@ func (c *PodAffinityChecker) satisfiesPodsAffinityAntiAffinity(pod *v1.Pod, node
|
||||
}
|
||||
|
||||
// PodToleratesNodeTaints checks if a pod tolerations can tolerate the node taints
|
||||
func PodToleratesNodeTaints(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func PodToleratesNodeTaints(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
return podToleratesNodeTaints(pod, nodeInfo, func(t *v1.Taint) bool {
|
||||
// PodToleratesNodeTaints is only interested in NoSchedule and NoExecute taints.
|
||||
return t.Effect == v1.TaintEffectNoSchedule || t.Effect == v1.TaintEffectNoExecute
|
||||
@@ -1254,7 +1259,7 @@ func PodToleratesNodeTaints(pod *v1.Pod, meta interface{}, nodeInfo *schedulerca
|
||||
}
|
||||
|
||||
// PodToleratesNodeNoExecuteTaints checks if a pod tolerations can tolerate the node's NoExecute taints
|
||||
func PodToleratesNodeNoExecuteTaints(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func PodToleratesNodeNoExecuteTaints(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
return podToleratesNodeTaints(pod, nodeInfo, func(t *v1.Taint) bool {
|
||||
return t.Effect == v1.TaintEffectNoExecute
|
||||
})
|
||||
@@ -1279,7 +1284,7 @@ func isPodBestEffort(pod *v1.Pod) bool {
|
||||
|
||||
// CheckNodeMemoryPressurePredicate checks if a pod can be scheduled on a node
|
||||
// reporting memory pressure condition.
|
||||
func CheckNodeMemoryPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func CheckNodeMemoryPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
var podBestEffort bool
|
||||
if predicateMeta, ok := meta.(*predicateMetadata); ok {
|
||||
podBestEffort = predicateMeta.podBestEffort
|
||||
@@ -1301,7 +1306,7 @@ func CheckNodeMemoryPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *s
|
||||
|
||||
// CheckNodeDiskPressurePredicate checks if a pod can be scheduled on a node
|
||||
// reporting disk pressure condition.
|
||||
func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
// check if node is under disk pressure
|
||||
if nodeInfo.DiskPressureCondition() == v1.ConditionTrue {
|
||||
return false, []algorithm.PredicateFailureReason{ErrNodeUnderDiskPressure}, nil
|
||||
@@ -1311,7 +1316,7 @@ func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *sch
|
||||
|
||||
// CheckNodeConditionPredicate checks if a pod can be scheduled on a node reporting out of disk,
|
||||
// network unavailable and not ready condition. Only node conditions are accounted in this predicate.
|
||||
func CheckNodeConditionPredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func CheckNodeConditionPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
reasons := []algorithm.PredicateFailureReason{}
|
||||
|
||||
if nodeInfo == nil || nodeInfo.Node() == nil {
|
||||
@@ -1359,7 +1364,7 @@ func NewVolumeNodePredicate(pvInfo PersistentVolumeInfo, pvcInfo PersistentVolum
|
||||
return c.predicate
|
||||
}
|
||||
|
||||
func (c *VolumeNodeChecker) predicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
func (c *VolumeNodeChecker) predicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.PersistentLocalVolumes) {
|
||||
return true, nil, nil
|
||||
}
|
||||
|
Reference in New Issue
Block a user