From 0cd1ee42597e22a041a65ae63d4a41a252954c71 Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Sun, 7 Jan 2024 04:59:21 +0000 Subject: [PATCH] add(scheduler/framework): implement smaller Pod update events --- pkg/scheduler/eventhandlers.go | 29 ++--- .../plugins/interpodaffinity/plugin.go | 4 +- .../framework/plugins/noderesources/fit.go | 4 +- .../plugins/noderesources/fit_test.go | 2 +- .../plugins/podtopologyspread/plugin.go | 4 +- pkg/scheduler/framework/types.go | 50 +++++++-- pkg/scheduler/framework/types_test.go | 12 +- pkg/scheduler/internal/queue/events.go | 51 ++++++++- pkg/scheduler/internal/queue/events_test.go | 105 ++++++++++++++++++ .../internal/queue/scheduling_queue.go | 45 ++++---- pkg/scheduler/scheduler_test.go | 28 ++++- 11 files changed, 267 insertions(+), 67 deletions(-) diff --git a/pkg/scheduler/eventhandlers.go b/pkg/scheduler/eventhandlers.go index a93449fa650..89446fb91de 100644 --- a/pkg/scheduler/eventhandlers.go +++ b/pkg/scheduler/eventhandlers.go @@ -270,19 +270,22 @@ func (sched *Scheduler) updatePodInCache(oldObj, newObj interface{}) { logger.Error(err, "Scheduler cache UpdatePod failed", "pod", klog.KObj(oldPod)) } - // SchedulingQueue.AssignedPodUpdated has a problem: - // It internally pre-filters Pods to move to activeQ, - // while taking only in-tree plugins into consideration. - // Consequently, if custom plugins that subscribes Pod/Update events reject Pods, - // those Pods will never be requeued to activeQ by an assigned Pod related events, - // and they may be stuck in unschedulableQ. - // - // Here we use MoveAllToActiveOrBackoffQueue only when QueueingHint is enabled. - // (We cannot switch to MoveAllToActiveOrBackoffQueue right away because of throughput concern.) - if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) { - sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, queue.AssignedPodUpdate, oldPod, newPod, nil) - } else { - sched.SchedulingQueue.AssignedPodUpdated(logger, oldPod, newPod) + events := queue.PodSchedulingPropertiesChange(newPod, oldPod) + for _, evt := range events { + // SchedulingQueue.AssignedPodUpdated has a problem: + // It internally pre-filters Pods to move to activeQ, + // while taking only in-tree plugins into consideration. + // Consequently, if custom plugins that subscribes Pod/Update events reject Pods, + // those Pods will never be requeued to activeQ by an assigned Pod related events, + // and they may be stuck in unschedulableQ. + // + // Here we use MoveAllToActiveOrBackoffQueue only when QueueingHint is enabled. + // (We cannot switch to MoveAllToActiveOrBackoffQueue right away because of throughput concern.) + if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) { + sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, queue.AssignedPodUpdate, oldPod, newPod, nil) + } else { + sched.SchedulingQueue.AssignedPodUpdated(logger, oldPod, newPod, evt) + } } } diff --git a/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go b/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go index 76ff0bca5f1..77aacab2cf6 100644 --- a/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go +++ b/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go @@ -62,7 +62,7 @@ func (pl *InterPodAffinity) EventsToRegister(_ context.Context) ([]framework.Clu // All ActionType includes the following events: // - Delete. An unschedulable Pod may fail due to violating an existing Pod's anti-affinity constraints, // deleting an existing Pod may make it schedulable. - // - Update. Updating on an existing Pod's labels (e.g., removal) may make + // - UpdatePodLabel. Updating on an existing Pod's labels (e.g., removal) may make // an unschedulable Pod schedulable. // - Add. An unschedulable Pod may fail due to violating pod-affinity constraints, // adding an assigned Pod may make it schedulable. @@ -75,7 +75,7 @@ func (pl *InterPodAffinity) EventsToRegister(_ context.Context) ([]framework.Clu // As a workaround, we add UpdateNodeTaint event to catch the case. // We can remove UpdateNodeTaint when we remove the preCheck feature. // See: https://github.com/kubernetes/kubernetes/issues/110175 - {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.All}, QueueingHintFn: pl.isSchedulableAfterPodChange}, + {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Add | framework.UpdatePodLabel | framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodChange}, {Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange}, }, nil } diff --git a/pkg/scheduler/framework/plugins/noderesources/fit.go b/pkg/scheduler/framework/plugins/noderesources/fit.go index 7b8c8ec174f..98708b4a8ab 100644 --- a/pkg/scheduler/framework/plugins/noderesources/fit.go +++ b/pkg/scheduler/framework/plugins/noderesources/fit.go @@ -250,9 +250,9 @@ func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error func (f *Fit) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) { podActionType := framework.Delete if f.enableInPlacePodVerticalScaling { - // If InPlacePodVerticalScaling (KEP 1287) is enabled, then PodUpdate event should be registered + // If InPlacePodVerticalScaling (KEP 1287) is enabled, then PodRequestUpdate event should be registered // for this plugin since a Pod update may free up resources that make other Pods schedulable. - podActionType |= framework.Update + podActionType |= framework.UpdatePodRequest } return []framework.ClusterEventWithHint{ {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: podActionType}, QueueingHintFn: f.isSchedulableAfterPodChange}, diff --git a/pkg/scheduler/framework/plugins/noderesources/fit_test.go b/pkg/scheduler/framework/plugins/noderesources/fit_test.go index d0cdddfbb6a..19a80935225 100644 --- a/pkg/scheduler/framework/plugins/noderesources/fit_test.go +++ b/pkg/scheduler/framework/plugins/noderesources/fit_test.go @@ -1095,7 +1095,7 @@ func TestEventsToRegister(t *testing.T) { "Register events with InPlacePodVerticalScaling feature enabled", true, []framework.ClusterEventWithHint{ - {Event: framework.ClusterEvent{Resource: "Pod", ActionType: framework.Update | framework.Delete}}, + {Event: framework.ClusterEvent{Resource: "Pod", ActionType: framework.UpdatePodRequest | framework.Delete}}, {Event: framework.ClusterEvent{Resource: "Node", ActionType: framework.Add | framework.Update}}, }, }, diff --git a/pkg/scheduler/framework/plugins/podtopologyspread/plugin.go b/pkg/scheduler/framework/plugins/podtopologyspread/plugin.go index cafec02b0aa..7bcfbd7f3b0 100644 --- a/pkg/scheduler/framework/plugins/podtopologyspread/plugin.go +++ b/pkg/scheduler/framework/plugins/podtopologyspread/plugin.go @@ -139,11 +139,11 @@ func (pl *PodTopologySpread) EventsToRegister(_ context.Context) ([]framework.Cl // All ActionType includes the following events: // - Add. An unschedulable Pod may fail due to violating topology spread constraints, // adding an assigned Pod may make it schedulable. - // - Update. Updating on an existing Pod's labels (e.g., removal) may make + // - UpdatePodLabel. Updating on an existing Pod's labels (e.g., removal) may make // an unschedulable Pod schedulable. // - Delete. An unschedulable Pod may fail due to violating an existing Pod's topology spread constraints, // deleting an existing Pod may make it schedulable. - {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.All}, QueueingHintFn: pl.isSchedulableAfterPodChange}, + {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Add | framework.UpdatePodLabel | framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodChange}, // Node add|delete|update maybe lead an topology key changed, // and make these pod in scheduling schedulable or unschedulable. // diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index d066059276f..f2a50de8f8c 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -46,25 +46,39 @@ type ActionType int64 // Constants for ActionTypes. const ( - Add ActionType = 1 << iota // 1 - Delete // 10 + Add ActionType = 1 << iota + Delete // UpdateNodeXYZ is only applicable for Node events. - UpdateNodeAllocatable // 100 - UpdateNodeLabel // 1000 - UpdateNodeTaint // 10000 - UpdateNodeCondition // 100000 - UpdateNodeAnnotation // 1000000 + // If you use UpdateNodeXYZ, + // your plugin's QueueingHint is only executed for the specific sub-Update event. + // It's better to narrow down the scope of the event by specifying them + // for better performance in requeueing. + UpdateNodeAllocatable + UpdateNodeLabel + UpdateNodeTaint + UpdateNodeCondition + UpdateNodeAnnotation - All ActionType = 1<