Merge pull request #28769 from wojtek-t/optimize_priorities
Automatic merge from submit-queue

Optimize priorities in scheduler

Ref #28590

It's probably easier to review it commit by commit, since the changes are largely independent of each other.

@davidopp - FYI
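The central pattern in this change: data that several predicates need for a given pod (here its resource request) is computed once in PredicateMetadata and handed to each predicate, which type-asserts it and falls back to recomputing it when the assertion fails. A minimal self-contained sketch of that shape, using simplified stand-in types rather than the scheduler's real api.Pod and predicateMetadata:

package main

import "fmt"

// resourceRequest stands in for the precomputed per-pod data; the real code
// stores a *resourceRequest inside predicateMetadata.
type resourceRequest struct {
	milliCPU int64
	memory   int64
}

// pod is a simplified stand-in for api.Pod.
type pod struct {
	name     string
	milliCPU int64
	memory   int64
}

// predicateMetadata is computed once per pod per scheduling cycle.
func predicateMetadata(p *pod) *resourceRequest {
	return &resourceRequest{milliCPU: p.milliCPU, memory: p.memory}
}

// podFitsResources accepts the metadata as interface{} and falls back to
// recomputing it when the type assertion fails, mirroring the change below.
func podFitsResources(p *pod, meta interface{}) bool {
	req, ok := meta.(*resourceRequest)
	if !ok {
		// We couldn't parse metadata - fall back to computing it.
		req = predicateMetadata(p)
	}
	return req.milliCPU <= 4000 && req.memory <= 8<<30
}

func main() {
	p := &pod{name: "web", milliCPU: 500, memory: 1 << 30}
	meta := predicateMetadata(p) // computed once
	for i := 0; i < 3; i++ {     // reused across predicate calls
		fmt.Println(podFitsResources(p, meta))
	}
}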
@@ -65,10 +65,10 @@ func (c *CachedNodeInfo) GetNodeInfo(id string) (*api.Node, error) {
 	return node.(*api.Node), nil
 }
 
-// podMetadata defines a type, that is an expected type that is passed
-// as metadata for predicate functions
+// podMetadata is a type that is passed as metadata for predicate functions
 type predicateMetadata struct {
 	podBestEffort bool
+	podRequest    *resourceRequest
 }
 
 func PredicateMetadata(pod *api.Pod) interface{} {
@@ -78,6 +78,7 @@ func PredicateMetadata(pod *api.Pod) interface{} {
 	}
 	return &predicateMetadata{
 		podBestEffort: isPodBestEffort(pod),
+		podRequest:    getResourceRequest(pod),
 	}
 }
 
@@ -405,7 +406,7 @@ type resourceRequest struct {
 	nvidiaGPU int64
 }
 
-func getResourceRequest(pod *api.Pod) resourceRequest {
+func getResourceRequest(pod *api.Pod) *resourceRequest {
 	result := resourceRequest{}
 	for _, container := range pod.Spec.Containers {
 		requests := container.Resources.Requests
@@ -423,7 +424,7 @@ func getResourceRequest(pod *api.Pod) resourceRequest {
 			result.milliCPU = cpu
 		}
 	}
-	return result
+	return &result
 }
 
 func CheckPodsExceedingFreeResources(pods []*api.Pod, allocatable api.ResourceList) (fitting []*api.Pod, notFittingCPU, notFittingMemory, notFittingNvidiaGPU []*api.Pod) {
@@ -471,17 +472,25 @@ func PodFitsResources(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.N
 	if node == nil {
 		return false, fmt.Errorf("node not found")
 	}
-	allocatable := node.Status.Allocatable
-	allowedPodNumber := allocatable.Pods().Value()
-	if int64(len(nodeInfo.Pods()))+1 > allowedPodNumber {
+	allowedPodNumber := nodeInfo.AllowedPodNumber()
+	if len(nodeInfo.Pods())+1 > allowedPodNumber {
 		return false,
-			newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), allowedPodNumber)
+			newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), int64(allowedPodNumber))
 	}
+
+	var podRequest *resourceRequest
+	predicateMeta, ok := meta.(*predicateMetadata)
+	if ok {
+		podRequest = predicateMeta.podRequest
+	} else {
+		// We couldn't parse metadata - fallback to computing it.
+		podRequest = getResourceRequest(pod)
+	}
-	podRequest := getResourceRequest(pod)
 	if podRequest.milliCPU == 0 && podRequest.memory == 0 && podRequest.nvidiaGPU == 0 {
 		return true, nil
 	}
 
+	allocatable := node.Status.Allocatable
 	totalMilliCPU := allocatable.Cpu().MilliValue()
 	totalMemory := allocatable.Memory().Value()
 	totalNvidiaGPU := allocatable.NvidiaGPU().Value()
@@ -498,8 +507,12 @@ func PodFitsResources(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.N
 		return false,
 			newInsufficientResourceError(nvidiaGpuResourceName, podRequest.nvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, totalNvidiaGPU)
 	}
-	glog.V(10).Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
-		podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
+	if glog.V(10) {
+		// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+		// not logged. There is visible performance gain from it.
+		glog.Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
+			podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
+	}
 	return true, nil
 }
 
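The comment added above captures the reasoning: glog.V(10).Infof(...) evaluates all of its arguments even when verbosity is below 10, so wrapping the call in if glog.V(10) { ... } skips the argument construction entirely on the hot path. A small sketch of the idiom with the real github.com/golang/glog package; expensiveSummary is a made-up stand-in for costly argument formatting:

package main

import (
	"flag"
	"fmt"

	"github.com/golang/glog"
)

// expensiveSummary is a hypothetical stand-in for argument construction that
// is costly (formatting pods, copying lists, etc.).
func expensiveSummary(n int) string {
	return fmt.Sprintf("running %d pods", n)
}

func main() {
	flag.Parse() // glog registers -v and friends on the default flag set

	// Arguments are always built here, even when -v < 10:
	glog.V(10).Infof("node status: %s", expensiveSummary(42))

	// Guarded form: expensiveSummary only runs when -v >= 10.
	if glog.V(10) {
		glog.Infof("node status: %s", expensiveSummary(42))
	}
	glog.Flush()
}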
@@ -758,8 +771,10 @@ func getUsedPorts(pods ...*api.Pod) map[int]bool {
 	// TODO: Aggregate it at the NodeInfo level.
 	ports := make(map[int]bool)
 	for _, pod := range pods {
-		for _, container := range pod.Spec.Containers {
-			for _, podPort := range container.Ports {
+		for j := range pod.Spec.Containers {
+			container := &pod.Spec.Containers[j]
+			for k := range container.Ports {
+				podPort := &container.Ports[k]
 				// "0" is explicitly ignored in PodFitsHostPorts,
 				// which is the only function that uses this value.
 				if podPort.HostPort != 0 {
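The loop rewrite above is about avoiding value copies: for _, container := range pod.Spec.Containers copies each api.Container (and each api.ContainerPort) on every iteration, while indexing and taking a pointer does not. A minimal sketch of the two forms, with a padded Item type standing in for the large API structs:

package main

import "fmt"

// Item stands in for a large struct such as api.Container; the payload
// field makes each range-copy noticeably expensive.
type Item struct {
	Port    int
	payload [4096]byte
}

func sumByValue(items []Item) int {
	total := 0
	for _, it := range items { // copies the whole Item each iteration
		total += it.Port
	}
	return total
}

func sumByIndex(items []Item) int {
	total := 0
	for i := range items {
		it := &items[i] // no copy, just a pointer into the slice
		total += it.Port
	}
	return total
}

func main() {
	items := make([]Item, 3)
	for i := range items {
		items[i].Port = 8080 + i
	}
	fmt.Println(sumByValue(items), sumByIndex(items))
}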
@@ -999,19 +1014,11 @@ func (checker *PodAffinityChecker) NodeMatchPodAffinityAntiAffinity(pod *api.Pod
 	return true
 }
 
-type TolerationMatch struct {
-	info NodeInfo
-}
-
-func NewTolerationMatchPredicate(info NodeInfo) algorithm.FitPredicate {
-	tolerationMatch := &TolerationMatch{
-		info: info,
-	}
-	return tolerationMatch.PodToleratesNodeTaints
-}
-
-func (t *TolerationMatch) PodToleratesNodeTaints(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
+func PodToleratesNodeTaints(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
 	node := nodeInfo.Node()
 	if node == nil {
 		return false, fmt.Errorf("node not found")
 	}
 
 	taints, err := api.GetTaintsFromNodeAnnotations(node.Annotations)
 	if err != nil {
@@ -1040,7 +1047,8 @@ func tolerationsToleratesTaints(tolerations []api.Toleration, taints []api.Taint
 		return false
 	}
 
-	for _, taint := range taints {
+	for i := range taints {
+		taint := &taints[i]
 		// skip taints that have effect PreferNoSchedule, since it is for priorities
 		if taint.Effect == api.TaintEffectPreferNoSchedule {
 			continue
@@ -2724,10 +2724,9 @@ func TestPodToleratesTaints(t *testing.T) {
 	}
 
 	for _, test := range podTolerateTaintsTests {
-		tolerationMatch := TolerationMatch{FakeNodeInfo(test.node)}
 		nodeInfo := schedulercache.NewNodeInfo()
 		nodeInfo.SetNode(&test.node)
-		fits, err := tolerationMatch.PodToleratesNodeTaints(test.pod, PredicateMetadata(test.pod), nodeInfo)
+		fits, err := PodToleratesNodeTaints(test.pod, PredicateMetadata(test.pod), nodeInfo)
 		if fits == false && !reflect.DeepEqual(err, ErrTaintsTolerationsNotMatch) {
 			t.Errorf("%s, unexpected error: %v", test.test, err)
 		}
@@ -25,31 +25,20 @@ import (
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
-type NodeAffinity struct {
-	nodeLister algorithm.NodeLister
-}
-
-func NewNodeAffinityPriority(nodeLister algorithm.NodeLister) algorithm.PriorityFunction {
-	nodeAffinity := &NodeAffinity{
-		nodeLister: nodeLister,
-	}
-	return nodeAffinity.CalculateNodeAffinityPriority
-}
-
 // CalculateNodeAffinityPriority prioritizes nodes according to node affinity scheduling preferences
 // indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node match a preferredSchedulingTerm,
 // it will a get an add of preferredSchedulingTerm.Weight. Thus, the more preferredSchedulingTerms
 // the node satisfies and the more the preferredSchedulingTerm that is satisfied weights, the higher
 // score the node gets.
-func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-	var maxCount int
-	counts := map[string]int{}
-
+func CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
 	nodes, err := nodeLister.List()
 	if err != nil {
 		return nil, err
 	}
 
+	var maxCount float64
+	counts := make(map[string]float64, len(nodes.Items))
+
 	affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
 	if err != nil {
 		return nil, err
@@ -72,7 +61,7 @@ func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInf
 
 		for _, node := range nodes.Items {
 			if nodeSelector.Matches(labels.Set(node.Labels)) {
-				counts[node.Name] += int(preferredSchedulingTerm.Weight)
+				counts[node.Name] += float64(preferredSchedulingTerm.Weight)
 			}
 
 			if counts[node.Name] > maxCount {
@@ -82,15 +71,20 @@ func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInf
 		}
 	}
 
-	result := []schedulerapi.HostPriority{}
+	result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items))
 	for i := range nodes.Items {
 		node := &nodes.Items[i]
-		fScore := float64(0)
 		if maxCount > 0 {
-			fScore = 10 * (float64(counts[node.Name]) / float64(maxCount))
+			fScore := 10 * (counts[node.Name] / maxCount)
+			result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
+			if glog.V(10) {
+				// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+				// not logged. There is visible performance gain from it.
+				glog.Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
+			}
+		} else {
+			result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: 0})
 		}
-		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
-		glog.V(10).Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
 	}
 	return result, nil
 }
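The scoring loop above normalizes the accumulated preferred-term weights into the scheduler's 0-10 range: fScore = 10 * counts[node]/maxCount, truncated to an int, and 0 for every node when nothing matched (maxCount == 0). A worked sketch of just that normalization, with made-up node names and weights:

package main

import "fmt"

func main() {
	// Sum of matching preferredSchedulingTerm weights per node (hypothetical values).
	counts := map[string]float64{"node-a": 6, "node-b": 3, "node-c": 0}

	var maxCount float64
	for _, c := range counts {
		if c > maxCount {
			maxCount = c
		}
	}

	for name, c := range counts {
		score := 0
		if maxCount > 0 {
			// Same shape as fScore := 10 * (counts[node.Name] / maxCount).
			score = int(10 * (c / maxCount))
		}
		fmt.Printf("%s -> %d\n", name, score) // node-a -> 10, node-b -> 5, node-c -> 0
	}
}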
@@ -156,8 +156,7 @@ func TestNodeAffinityPriority(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		nodeAffinity := NodeAffinity{nodeLister: algorithm.FakeNodeLister(api.NodeList{Items: test.nodes})}
-		list, err := nodeAffinity.CalculateNodeAffinityPriority(test.pod, schedulercache.CreateNodeNameToInfoMap(nil), algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))
+		list, err := CalculateNodeAffinityPriority(test.pod, schedulercache.CreateNodeNameToInfoMap(nil), algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))
 		if err != nil {
 			t.Errorf("unexpected error: %v", err)
 		}
@@ -24,22 +24,10 @@ import (
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
-// NodeTaints hold the node lister
-type TaintToleration struct {
-	nodeLister algorithm.NodeLister
-}
-
-// NewTaintTolerationPriority
-func NewTaintTolerationPriority(nodeLister algorithm.NodeLister) algorithm.PriorityFunction {
-	taintToleration := &TaintToleration{
-		nodeLister: nodeLister,
-	}
-	return taintToleration.ComputeTaintTolerationPriority
-}
-
 // CountIntolerableTaintsPreferNoSchedule gives the count of intolerable taints of a pod with effect PreferNoSchedule
-func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []api.Toleration) (intolerableTaints int) {
-	for _, taint := range taints {
+func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []api.Toleration) (intolerableTaints float64) {
+	for i := range taints {
+		taint := &taints[i]
 		// check only on taints that have effect PreferNoSchedule
 		if taint.Effect != api.TaintEffectPreferNoSchedule {
 			continue
@@ -54,27 +42,27 @@ func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []ap
 
 // getAllTolerationEffectPreferNoSchedule gets the list of all Toleration with Effect PreferNoSchedule
 func getAllTolerationPreferNoSchedule(tolerations []api.Toleration) (tolerationList []api.Toleration) {
-	for _, toleration := range tolerations {
+	for i := range tolerations {
+		toleration := &tolerations[i]
 		if len(toleration.Effect) == 0 || toleration.Effect == api.TaintEffectPreferNoSchedule {
-			tolerationList = append(tolerationList, toleration)
+			tolerationList = append(tolerationList, *toleration)
 		}
 	}
 	return
 }
 
 // ComputeTaintTolerationPriority prepares the priority list for all the nodes based on the number of intolerable taints on the node
-func (s *TaintToleration) ComputeTaintTolerationPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-	// counts hold the count of intolerable taints of a pod for a given node
-	counts := make(map[string]int)
-
-	// the max value of counts
-	var maxCount int
-
+func ComputeTaintTolerationPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
 	nodes, err := nodeLister.List()
 	if err != nil {
 		return nil, err
 	}
 
+	// the max value of counts
+	var maxCount float64
+	// counts hold the count of intolerable taints of a pod for a given node
+	counts := make(map[string]float64, len(nodes.Items))
+
 	tolerations, err := api.GetTolerationsFromPodAnnotations(pod.Annotations)
 	if err != nil {
 		return nil, err
@@ -99,14 +87,19 @@ func (s *TaintToleration) ComputeTaintTolerationPriority(pod *api.Pod, nodeNameT
 
 	// The maximum priority value to give to a node
 	// Priority values range from 0 - maxPriority
-	const maxPriority = 10
+	const maxPriority = float64(10)
+	result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items))
-	for _, node := range nodes.Items {
-		fScore := float64(maxPriority)
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+		fScore := maxPriority
 		if maxCount > 0 {
-			fScore = (1.0 - float64(counts[node.Name])/float64(maxCount)) * 10
+			fScore = (1.0 - counts[node.Name]/maxCount) * 10
 		}
+		if glog.V(10) {
+			// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
+			// not logged. There is visible performance gain from it.
+			glog.Infof("%v -> %v: Taint Toleration Priority, Score: (%d)", pod.Name, node.Name, int(fScore))
+		}
-		glog.V(10).Infof("%v -> %v: Taint Toleration Priority, Score: (%d)", pod.Name, node.Name, int(fScore))
 
 		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
 	}
@@ -212,8 +212,7 @@ func TestTaintAndToleration(t *testing.T) {
 	}
 	for _, test := range tests {
 		nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap([]*api.Pod{{}})
-		taintToleration := TaintToleration{nodeLister: algorithm.FakeNodeLister(api.NodeList{Items: test.nodes})}
-		list, err := taintToleration.ComputeTaintTolerationPriority(
+		list, err := ComputeTaintTolerationPriority(
			test.pod,
			nodeNameToInfo,
			algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))