scheduler: publish PodSchedulingContext during PreBind

Blocking API calls during a scheduling cycle, as the DRA plugin is doing, slow
down overall scheduling, i.e. they also affect pods which don't use DRA.

It is easy to move the blocking calls into a goroutine and let the scheduling
cycle end with "pod unschedulable". The hard part is handling an error when
those API calls then fail in the background. There is a solution for that
(see https://github.com/kubernetes/kubernetes/pull/120963), but it's complex.
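
For illustration, here is a minimal, self-contained Go sketch of that rejected approach. The names publishPodSchedulingContext and requeuePod are hypothetical stand-ins, not the real plugin API; the point is only that once the cycle has returned "unschedulable", a failure in the background goroutine can no longer be reported as the cycle's status and has to travel back through some side channel.

package main

import (
	"context"
	"fmt"
	"time"
)

// publishPodSchedulingContext is a hypothetical stand-in for the blocking
// API call; here it always fails to demonstrate the error path.
func publishPodSchedulingContext(ctx context.Context, pod string) error {
	return fmt.Errorf("server rejected update for %s", pod)
}

// reserve models a scheduling-cycle hook that has already decided to return
// "pod unschedulable" and pushes the API call into the background.
func reserve(ctx context.Context, pod string, requeuePod func(pod string, reason error)) {
	go func() {
		// The scheduling cycle is already over, so this error cannot be
		// returned as the cycle's status; it has to be handled through a
		// side channel such as re-queueing the pod, which is what makes
		// the approach complex.
		if err := publishPodSchedulingContext(ctx, pod); err != nil {
			requeuePod(pod, err)
		}
	}()
}

func main() {
	reserve(context.Background(), "my-pod", func(pod string, reason error) {
		fmt.Printf("re-queueing %s: %v\n", pod, reason)
	})
	time.Sleep(100 * time.Millisecond) // give the goroutine time to finish
}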

Instead, publishing the modified PodSchedulingContext can also be done
later. In the more common case of a pod which is ready for binding except for
its claims, that happens in PreBind, which already runs in a separate goroutine.

In the less common case that a pod cannot be scheduled, that happens in
Unreserve, which is still blocking.
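
The following is a minimal sketch of that deferred-publish pattern, using simplified stand-in types rather than the real dynamicResources plugin code. It assumes a dirty flag similar to the plugin's podSchedulingState.isDirty(): Reserve only records the change, while PreBind and Unreserve perform the actual (blocking) publish.

package main

import (
	"context"
	"fmt"
)

// podSchedulingState is a simplified stand-in for the plugin's per-pod state.
type podSchedulingState struct {
	selectedNode *string // pending change not yet written to the API server
}

// isDirty reports whether there are local changes that still need to be published.
func (s *podSchedulingState) isDirty() bool {
	return s.selectedNode != nil
}

// publish is a stand-in for the blocking PodSchedulingContext create/update call.
func (s *podSchedulingState) publish(ctx context.Context) error {
	fmt.Println("publishing PodSchedulingContext, selected node:", *s.selectedNode)
	s.selectedNode = nil
	return nil
}

// reserve runs inside the scheduling cycle and therefore only records the change.
func reserve(s *podSchedulingState, nodeName string) {
	s.selectedNode = &nodeName
	// The actual publish happens in preBind or unreserve.
}

// preBind already runs in a separate goroutine, so blocking here does not
// slow down the scheduling of other pods.
func preBind(ctx context.Context, s *podSchedulingState) error {
	if s.isDirty() {
		return s.publish(ctx)
	}
	return nil
}

// unreserve covers the less common case that the pod could not be scheduled.
func unreserve(ctx context.Context, s *podSchedulingState) {
	if s.isDirty() {
		if err := s.publish(ctx); err != nil {
			fmt.Println("publish PodSchedulingContext:", err)
		}
	}
}

func main() {
	s := &podSchedulingState{}
	reserve(s, "node-1")
	_ = preBind(context.Background(), s)
	unreserve(context.Background(), s) // no-op here because preBind already published
}
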
Author: Patrick Ohly
Date: 2023-10-02 13:08:55 +02:00
parent 5d1509126f
commit a809a6353b
2 changed files with 34 additions and 14 deletions


@@ -958,16 +958,15 @@ func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleStat
             state.podSchedulingState.schedulingCtx.Spec.SelectedNode != nodeName {
             state.podSchedulingState.selectedNode = &nodeName
             logger.V(5).Info("start allocation", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
-            if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
-                return statusError(logger, err)
-            }
-            return statusPending(logger, "waiting for resource driver to allocate resource", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
+            // The actual publish happens in PreBind or Unreserve.
+            return nil
         }
     }

     // May have been modified earlier in PreScore or above.
-    if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
-        return statusError(logger, err)
+    if state.podSchedulingState.isDirty() {
+        // The actual publish happens in PreBind or Unreserve.
+        return nil
     }

     // More than one pending claim and not enough information about all of them.
@@ -1004,6 +1003,18 @@ func (pl *dynamicResources) Unreserve(ctx context.Context, cs *framework.CycleSt
     }

     logger := klog.FromContext(ctx)
+
+    // Was publishing delayed? If yes, do it now.
+    //
+    // The most common scenario is that a different set of potential nodes
+    // was identified. This revised set needs to be published to enable DRA
+    // drivers to provide better guidance for future scheduling attempts.
+    if state.podSchedulingState.isDirty() {
+        if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+            logger.Error(err, "publish PodSchedulingContext")
+        }
+    }
+
     for _, claim := range state.claims {
         if claim.Status.Allocation != nil &&
             resourceclaim.IsReservedForPod(pod, claim) {
@@ -1042,6 +1053,15 @@ func (pl *dynamicResources) PreBind(ctx context.Context, cs *framework.CycleStat
     }

     logger := klog.FromContext(ctx)
+
+    // Was publishing delayed? If yes, do it now and then cause binding to stop.
+    if state.podSchedulingState.isDirty() {
+        if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
+            return statusError(logger, err)
+        }
+        return statusPending(logger, "waiting for resource driver", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
+    }
+
     for index, claim := range state.claims {
         if !resourceclaim.IsReservedForPod(pod, claim) {
             // The claim might be stale, for example because the claim can get shared and some