When a QueueingHintFn returns an error, the scheduling queue logs the error and treats the hint as QueueAfterBackoff.

Co-authored-by: Kensei Nakada <handbomusic@gmail.com> Co-authored-by: Kante Yin <kerthcet@gmail.com> Co-authored-by: XsWack <xushiwei5@huawei.com>
2023-07-13 21:45:26 +08:00
parent 09200e9c92
commit 0105a002bc
13 changed files with 216 additions and 97 deletions
--- a/pkg/scheduler/internal/queue/scheduling_queue.go
+++ b/pkg/scheduler/internal/queue/scheduling_queue.go
@@ -433,7 +433,20 @@ func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework
 				continue
 			}

-			switch h := hintfn.QueueingHintFn(logger, pod, oldObj, newObj); h {
+			h, err := hintfn.QueueingHintFn(logger, pod, oldObj, newObj)
+			if err != nil {
+				// If the QueueingHintFn returned an error, we treat the event as QueueAfterBackoff so that
+				// the Pod does not get stuck in the unschedulable pod pool.
+				oldObjMeta, newObjMeta, asErr := util.As[klog.KMetadata](oldObj, newObj)
+				if asErr != nil {
+					logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod))
+				} else {
+					logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod), "oldObj", klog.KObj(oldObjMeta), "newObj", klog.KObj(newObjMeta))
+				}
+				h = framework.QueueAfterBackoff
+			}
+
+			switch h {
 			case framework.QueueSkip:
 				continue
 			case framework.QueueImmediately: