/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package predicates

import (
	"fmt"

	"k8s.io/klog"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	"k8s.io/kubernetes/pkg/features"
	pluginhelper "k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
	schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)

const (
	// MatchInterPodAffinityPred defines the name of predicate MatchInterPodAffinity.
	MatchInterPodAffinityPred = "MatchInterPodAffinity"
	// CheckVolumeBindingPred defines the name of predicate CheckVolumeBinding.
	CheckVolumeBindingPred = "CheckVolumeBinding"
	// GeneralPred defines the name of predicate GeneralPredicates.
	GeneralPred = "GeneralPredicates"
	// HostNamePred defines the name of predicate HostName.
	HostNamePred = "HostName"
	// PodFitsHostPortsPred defines the name of predicate PodFitsHostPorts.
	PodFitsHostPortsPred = "PodFitsHostPorts"
	// MatchNodeSelectorPred defines the name of predicate MatchNodeSelector.
	MatchNodeSelectorPred = "MatchNodeSelector"
	// PodFitsResourcesPred defines the name of predicate PodFitsResources.
	PodFitsResourcesPred = "PodFitsResources"
	// NoDiskConflictPred defines the name of predicate NoDiskConflict.
	NoDiskConflictPred = "NoDiskConflict"
	// PodToleratesNodeTaintsPred defines the name of predicate PodToleratesNodeTaints.
	PodToleratesNodeTaintsPred = "PodToleratesNodeTaints"
	// CheckNodeUnschedulablePred defines the name of predicate CheckNodeUnschedulable.
	CheckNodeUnschedulablePred = "CheckNodeUnschedulable"
	// PodToleratesNodeNoExecuteTaintsPred defines the name of predicate PodToleratesNodeNoExecuteTaints.
	PodToleratesNodeNoExecuteTaintsPred = "PodToleratesNodeNoExecuteTaints"
	// CheckNodeLabelPresencePred defines the name of predicate CheckNodeLabelPresence.
	CheckNodeLabelPresencePred = "CheckNodeLabelPresence"
	// CheckServiceAffinityPred defines the name of predicate CheckServiceAffinity.
	CheckServiceAffinityPred = "CheckServiceAffinity"
	// MaxEBSVolumeCountPred defines the name of predicate MaxEBSVolumeCount.
	// DEPRECATED
	// All cloudprovider specific predicates are deprecated in favour of MaxCSIVolumeCountPred.
	MaxEBSVolumeCountPred = "MaxEBSVolumeCount"
	// MaxGCEPDVolumeCountPred defines the name of predicate MaxGCEPDVolumeCount.
	// DEPRECATED
	// All cloudprovider specific predicates are deprecated in favour of MaxCSIVolumeCountPred.
	MaxGCEPDVolumeCountPred = "MaxGCEPDVolumeCount"
	// MaxAzureDiskVolumeCountPred defines the name of predicate MaxAzureDiskVolumeCount.
	// DEPRECATED
	// All cloudprovider specific predicates are deprecated in favour of MaxCSIVolumeCountPred.
	MaxAzureDiskVolumeCountPred = "MaxAzureDiskVolumeCount"
	// MaxCinderVolumeCountPred defines the name of predicate MaxCinderVolumeCount.
	// DEPRECATED
	// All cloudprovider specific predicates are deprecated in favour of MaxCSIVolumeCountPred.
	MaxCinderVolumeCountPred = "MaxCinderVolumeCount"
	// MaxCSIVolumeCountPred defines the name of the predicate that limits how many CSI volumes can be attached to a node.
	MaxCSIVolumeCountPred = "MaxCSIVolumeCountPred"
	// NoVolumeZoneConflictPred defines the name of predicate NoVolumeZoneConflict.
	NoVolumeZoneConflictPred = "NoVolumeZoneConflict"
	// EvenPodsSpreadPred defines the name of predicate EvenPodsSpread.
	EvenPodsSpreadPred = "EvenPodsSpread"
)

// IMPORTANT NOTE for predicate developers:
// We are using cached predicate results for pods belonging to the same equivalence class.
// So when updating an existing predicate, you should consider whether your change introduces a new
// dependency on attributes of any API object, such as Pod, Node, or Service.
// If it does, you are expected to invalidate the cached predicate results when the related API object changes.
// For example:
// https://github.com/kubernetes/kubernetes/blob/36a218e/plugin/pkg/scheduler/factory/factory.go#L422

// IMPORTANT NOTE: this list defines the ordering of the predicates. If you develop a new predicate,
// it is mandatory to add its name to this list; otherwise it won't be processed
// (see generic_scheduler#podFitsOnNode()).
// The order is based on the restrictiveness & complexity of predicates.
// Design doc: https://github.com/kubernetes/community/blob/master/contributors/design-proposals/scheduling/predicates-ordering.md
var (
	predicatesOrdering = []string{CheckNodeUnschedulablePred,
		GeneralPred, HostNamePred, PodFitsHostPortsPred,
		MatchNodeSelectorPred, PodFitsResourcesPred, NoDiskConflictPred,
		PodToleratesNodeTaintsPred, PodToleratesNodeNoExecuteTaintsPred, CheckNodeLabelPresencePred,
		CheckServiceAffinityPred, MaxEBSVolumeCountPred, MaxGCEPDVolumeCountPred, MaxCSIVolumeCountPred,
		MaxAzureDiskVolumeCountPred, MaxCinderVolumeCountPred, CheckVolumeBindingPred, NoVolumeZoneConflictPred,
		EvenPodsSpreadPred, MatchInterPodAffinityPred}
)

// Ordering returns the ordering of predicates.
func Ordering() []string {
	return predicatesOrdering
}

// FitPredicate is a function that indicates whether a pod fits on a given node.
// Failure reasons are returned in the PredicateFailureReason slice; the error is reserved for unexpected problems.
type FitPredicate func(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error)
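
// Illustrative sketch (not part of the original file): one way a caller could
// evaluate a set of registered predicates in the order returned by Ordering().
// The `registered` map is hypothetical; real registration and invocation live
// in the scheduler's algorithm provider and generic_scheduler#podFitsOnNode().
func runPredicatesInOrder(registered map[string]FitPredicate, pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	var failures []PredicateFailureReason
	for _, name := range Ordering() {
		predicate, ok := registered[name]
		if !ok {
			// Predicates that are not registered are simply skipped.
			continue
		}
		fit, reasons, err := predicate(pod, meta, nodeInfo)
		if err != nil {
			return false, failures, err
		}
		if !fit {
			failures = append(failures, reasons...)
		}
	}
	return len(failures) == 0, failures, nil
}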

// GetResourceRequest returns a *schedulernodeinfo.Resource that covers the largest
// width in each resource dimension. Because init-containers run sequentially, we collect
// the max in each dimension iteratively. In contrast, we sum the resource vectors for
// regular containers since they run simultaneously.
//
// If Pod Overhead is specified and the PodOverhead feature gate is enabled, the resources
// defined for Overhead are added to the calculated Resource request sum.
//
// Example:
//
// Pod:
//   InitContainers
//     IC1:
//       CPU: 2
//       Memory: 1G
//     IC2:
//       CPU: 2
//       Memory: 3G
//   Containers
//     C1:
//       CPU: 2
//       Memory: 1G
//     C2:
//       CPU: 1
//       Memory: 1G
//
// Result: CPU: 3, Memory: 3G
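//
// Illustrative extension (not in the original comment): if the same pod also
// declared Overhead of CPU: 1, Memory: 1G and the PodOverhead feature gate were
// enabled, that overhead would be added on top, giving CPU: 4, Memory: 4G.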
func GetResourceRequest(pod *v1.Pod) *schedulernodeinfo.Resource {
	result := &schedulernodeinfo.Resource{}
	for _, container := range pod.Spec.Containers {
		result.Add(container.Resources.Requests)
	}

	// take max_resource(sum_pod, any_init_container)
	for _, container := range pod.Spec.InitContainers {
		result.SetMaxResource(container.Resources.Requests)
	}

	// If Overhead is being utilized, add to the total requests for the pod
	if pod.Spec.Overhead != nil && utilfeature.DefaultFeatureGate.Enabled(features.PodOverhead) {
		result.Add(pod.Spec.Overhead)
	}

	return result
}

func podName(pod *v1.Pod) string {
	return pod.Namespace + "/" + pod.Name
}

// PodFitsResources is a wrapper around PodFitsResourcesPredicate that satisfies the FitPredicate signature.
// TODO(#85822): remove this function once predicate registration logic is deleted.
func PodFitsResources(pod *v1.Pod, _ Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	return PodFitsResourcesPredicate(pod, nil, nil, nodeInfo)
}

// PodFitsResourcesPredicate checks if a node has sufficient resources, such as CPU, memory, GPU, and extended resources, to run a pod.
// The first return value indicates whether the node has sufficient resources to run the pod; the second lists the
// predicate failure reasons when it does not.
func PodFitsResourcesPredicate(pod *v1.Pod, podRequest *schedulernodeinfo.Resource, ignoredExtendedResources sets.String, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	node := nodeInfo.Node()
	if node == nil {
		return false, nil, fmt.Errorf("node not found")
	}

	var predicateFails []PredicateFailureReason
	allowedPodNumber := nodeInfo.AllowedPodNumber()
	if len(nodeInfo.Pods())+1 > allowedPodNumber {
		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourcePods, 1, int64(len(nodeInfo.Pods())), int64(allowedPodNumber)))
	}

	if ignoredExtendedResources == nil {
		ignoredExtendedResources = sets.NewString()
	}

	if podRequest == nil {
		podRequest = GetResourceRequest(pod)
	}
	if podRequest.MilliCPU == 0 &&
		podRequest.Memory == 0 &&
		podRequest.EphemeralStorage == 0 &&
		len(podRequest.ScalarResources) == 0 {
		return len(predicateFails) == 0, predicateFails, nil
	}

	allocatable := nodeInfo.AllocatableResource()
	if allocatable.MilliCPU < podRequest.MilliCPU+nodeInfo.RequestedResource().MilliCPU {
		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceCPU, podRequest.MilliCPU, nodeInfo.RequestedResource().MilliCPU, allocatable.MilliCPU))
	}
	if allocatable.Memory < podRequest.Memory+nodeInfo.RequestedResource().Memory {
		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceMemory, podRequest.Memory, nodeInfo.RequestedResource().Memory, allocatable.Memory))
	}
	if allocatable.EphemeralStorage < podRequest.EphemeralStorage+nodeInfo.RequestedResource().EphemeralStorage {
		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceEphemeralStorage, podRequest.EphemeralStorage, nodeInfo.RequestedResource().EphemeralStorage, allocatable.EphemeralStorage))
	}

	for rName, rQuant := range podRequest.ScalarResources {
		if v1helper.IsExtendedResourceName(rName) {
			// If this resource is one of the extended resources that should be
			// ignored, we will skip checking it.
			if ignoredExtendedResources.Has(string(rName)) {
				continue
			}
		}
		if allocatable.ScalarResources[rName] < rQuant+nodeInfo.RequestedResource().ScalarResources[rName] {
			predicateFails = append(predicateFails, NewInsufficientResourceError(rName, podRequest.ScalarResources[rName], nodeInfo.RequestedResource().ScalarResources[rName], allocatable.ScalarResources[rName]))
		}
	}

	if klog.V(10) && len(predicateFails) == 0 {
		// We explicitly don't do klog.V(10).Infof() to avoid computing all the parameters if this is
		// not logged. There is a visible performance gain from it.
		klog.Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
			podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
	}
	return len(predicateFails) == 0, predicateFails, nil
}
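
// Illustrative sketch (not part of the original file): checking whether a pod
// fits a node's free resources. The node, pod, and already-scheduled pods are
// hypothetical inputs; NewNodeInfo and SetNode come from the schedulernodeinfo
// package imported above.
func exampleCheckFit(pod *v1.Pod, node *v1.Node, scheduledPods []*v1.Pod) (bool, []PredicateFailureReason, error) {
	nodeInfo := schedulernodeinfo.NewNodeInfo(scheduledPods...)
	if err := nodeInfo.SetNode(node); err != nil {
		return false, nil, err
	}
	// Passing nil for podRequest and ignoredExtendedResources makes the
	// predicate compute the pod's request itself and ignore no resources.
	return PodFitsResourcesPredicate(pod, nil, nil, nodeInfo)
}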

// PodMatchNodeSelector checks if the pod's node selector and required node affinity match the node's labels.
func PodMatchNodeSelector(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	node := nodeInfo.Node()
	if node == nil {
		return false, nil, fmt.Errorf("node not found")
	}
	if pluginhelper.PodMatchesNodeSelectorAndAffinityTerms(pod, node) {
		return true, nil, nil
	}
	return false, []PredicateFailureReason{ErrNodeSelectorNotMatch}, nil
}

// PodFitsHost checks if the pod's spec.nodeName matches the node under consideration.
func PodFitsHost(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	if len(pod.Spec.NodeName) == 0 {
		return true, nil, nil
	}
	node := nodeInfo.Node()
	if node == nil {
		return false, nil, fmt.Errorf("node not found")
	}
	if pod.Spec.NodeName == node.Name {
		return true, nil, nil
	}
	return false, []PredicateFailureReason{ErrPodNotMatchHostName}, nil
}

// PodFitsHostPorts is a wrapper around PodFitsHostPortsPredicate. This is needed until
// we are able to get rid of the FitPredicate function signature.
// TODO(#85822): remove this function once predicate registration logic is deleted.
func PodFitsHostPorts(pod *v1.Pod, _ Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	return PodFitsHostPortsPredicate(pod, nil, nodeInfo)
}

// PodFitsHostPortsPredicate checks if a node has free ports for the requested pod ports.
func PodFitsHostPortsPredicate(pod *v1.Pod, meta []*v1.ContainerPort, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	wantPorts := meta
	if wantPorts == nil {
		// Fall back to computing the ports from the pod spec.
		wantPorts = schedutil.GetContainerPorts(pod)
	}
	if len(wantPorts) == 0 {
		return true, nil, nil
	}

	existingPorts := nodeInfo.UsedPorts()

	// Check whether any requested port conflicts with ports already in use on the node.
	if portsConflict(existingPorts, wantPorts) {
		return false, []PredicateFailureReason{ErrPodNotFitsHostPorts}, nil
	}

	return true, nil, nil
}
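
// Illustrative sketch (not part of the original file): when the same pod is
// checked against many nodes, its container ports can be computed once with
// schedutil.GetContainerPorts and passed as meta, so each call skips the
// fallback computation above. The nodeInfos slice is a hypothetical input.
func exampleCheckHostPortsOnNodes(pod *v1.Pod, nodeInfos []*schedulernodeinfo.NodeInfo) []bool {
	wantPorts := schedutil.GetContainerPorts(pod)
	fits := make([]bool, 0, len(nodeInfos))
	for _, nodeInfo := range nodeInfos {
		fit, _, _ := PodFitsHostPortsPredicate(pod, wantPorts, nodeInfo)
		fits = append(fits, fit)
	}
	return fits
}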

// GeneralPredicates checks a group of predicates that the kubelet cares about.
// DEPRECATED: this exists only because the kubelet uses it. We should change the kubelet to execute the individual predicates it requires.
func GeneralPredicates(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	var predicateFails []PredicateFailureReason
	for _, predicate := range []FitPredicate{PodFitsResources, PodFitsHost, PodFitsHostPorts, PodMatchNodeSelector} {
		fit, reasons, err := predicate(pod, meta, nodeInfo)
		if err != nil {
			return false, predicateFails, err
		}
		if !fit {
			predicateFails = append(predicateFails, reasons...)
		}
	}

	return len(predicateFails) == 0, predicateFails, nil
}

// PodToleratesNodeTaints checks whether the pod's tolerations can tolerate the node's taints.
func PodToleratesNodeTaints(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	if nodeInfo == nil || nodeInfo.Node() == nil {
		return false, []PredicateFailureReason{ErrNodeUnknownCondition}, nil
	}

	return podToleratesNodeTaints(pod, nodeInfo, func(t *v1.Taint) bool {
		// PodToleratesNodeTaints is only interested in NoSchedule and NoExecute taints.
		return t.Effect == v1.TaintEffectNoSchedule || t.Effect == v1.TaintEffectNoExecute
	})
}

// PodToleratesNodeNoExecuteTaints checks whether the pod's tolerations can tolerate the node's NoExecute taints.
func PodToleratesNodeNoExecuteTaints(pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
	return podToleratesNodeTaints(pod, nodeInfo, func(t *v1.Taint) bool {
		return t.Effect == v1.TaintEffectNoExecute
	})
}

func podToleratesNodeTaints(pod *v1.Pod, nodeInfo *schedulernodeinfo.NodeInfo, filter func(t *v1.Taint) bool) (bool, []PredicateFailureReason, error) {
	taints, err := nodeInfo.Taints()
	if err != nil {
		return false, nil, err
	}

	if v1helper.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, taints, filter) {
		return true, nil, nil
	}
	return false, []PredicateFailureReason{ErrTaintsTolerationsNotMatch}, nil
}
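
// Illustrative example (not part of the original file): a toleration that lets
// a pod pass PodToleratesNodeTaints on a node with a single hypothetical taint.
//
//	taint:      {Key: "dedicated", Value: "gpu", Effect: v1.TaintEffectNoSchedule}
//	toleration: {Key: "dedicated", Operator: v1.TolerationOpEqual, Value: "gpu", Effect: v1.TaintEffectNoSchedule}
//
// With that toleration in pod.Spec.Tolerations the predicate returns true for
// such a node; without it, the failure reason is ErrTaintsTolerationsNotMatch.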