scheduler/dra: reduce pod scheduling latency

This is a combination of two related enhancements:
- A PreEnqueue check avoids the initial scheduling attempt for a pod
  with a claim template while the generated claim does not exist yet.
- Cluster event checks (queueing hints) requeue only those pods for
  which something relevant changed, and requeue them immediately
  instead of after a backoff delay; see the sketch after this list.
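
As a minimal standalone sketch of the second mechanism (the Hint and Claim
types below are simplified stand-ins for illustration, not the real scheduler
framework API): a hint callback looks at the object before and after an
informer event and tells the queue whether to skip the event, requeue the pod
immediately, or fall back to the usual backoff.

package main

import "fmt"

// Hint is a simplified stand-in for the scheduler's queueing hint result.
type Hint string

const (
	QueueSkip         Hint = "skip"          // irrelevant event, keep waiting
	QueueImmediately  Hint = "immediately"   // relevant change, retry without backoff
	QueueAfterBackoff Hint = "after-backoff" // unknown change, retry after backoff
)

// Claim is a simplified stand-in for a ResourceClaim with only the fields
// that the sketch needs.
type Claim struct {
	UID       string
	Allocated bool
}

// claimChangeHint mimics the shape of isSchedulableAfterClaimChange: given
// the claim before and after an informer event, decide how to requeue a pod
// that waits for the claim with UID usedUID.
func claimChangeHint(usedUID string, oldClaim, newClaim *Claim) Hint {
	if newClaim == nil {
		// Deletes don't make a pod schedulable.
		return QueueSkip
	}
	if newClaim.UID != usedUID {
		// Not a claim this pod is waiting for.
		return QueueSkip
	}
	if oldClaim == nil {
		// The claim the pod was waiting for got created.
		return QueueImmediately
	}
	if oldClaim.Allocated == newClaim.Allocated {
		// Nothing the pod cares about changed, e.g. only a finalizer was added.
		return QueueSkip
	}
	// Allocation state changed: worth another scheduling attempt right away.
	return QueueImmediately
}

func main() {
	used := "claim-1"
	fmt.Println(claimChangeHint(used, nil, &Claim{UID: "claim-1"}))                                     // immediately
	fmt.Println(claimChangeHint(used, &Claim{UID: "claim-1"}, &Claim{UID: "claim-1", Allocated: true})) // immediately
	fmt.Println(claimChangeHint(used, &Claim{UID: "claim-2"}, &Claim{UID: "claim-2", Allocated: true})) // skip
}
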
Patrick Ohly
2023-02-13 09:34:11 +01:00
parent e01db32573
commit 6f1a29520f
2 changed files with 516 additions and 31 deletions

@@ -23,8 +23,11 @@ import (
"sort"
"sync"
"github.com/google/go-cmp/cmp"
v1 "k8s.io/api/core/v1"
resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
@@ -38,6 +41,7 @@ import (
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
const (
@@ -208,14 +212,25 @@ func statusForClaim(schedulingCtx *resourcev1alpha2.PodSchedulingContext, podCla
// dynamicResources is a plugin that ensures that ResourceClaims are allocated.
type dynamicResources struct {
enabled bool
fh framework.Handle
clientset kubernetes.Interface
claimLister resourcev1alpha2listers.ResourceClaimLister
classLister resourcev1alpha2listers.ResourceClassLister
podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister
// logger is only meant to be used by background activities which don't
// have some other logger in their parent callstack.
logger klog.Logger
}
// New initializes a new plugin and returns it.
func New(plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
// TODO: the runtime should set up logging for each plugin, including
// adding a name for each one (same as in kube-controller-manager).
return NewWithLogger(klog.TODO(), plArgs, fh, fts)
}
func NewWithLogger(logger klog.Logger, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
if !fts.EnableDynamicResourceAllocation {
// Disabled, won't do anything.
return &dynamicResources{}, nil
@@ -223,13 +238,16 @@ func New(plArgs runtime.Object, fh framework.Handle, fts feature.Features) (fram
return &dynamicResources{
enabled: true,
fh: fh,
clientset: fh.ClientSet(),
claimLister: fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaims().Lister(),
classLister: fh.SharedInformerFactory().Resource().V1alpha2().ResourceClasses().Lister(),
podSchedulingContextLister: fh.SharedInformerFactory().Resource().V1alpha2().PodSchedulingContexts().Lister(),
logger: logger,
}, nil
}
var _ framework.PreEnqueuePlugin = &dynamicResources{}
var _ framework.PreFilterPlugin = &dynamicResources{}
var _ framework.FilterPlugin = &dynamicResources{}
var _ framework.PostFilterPlugin = &dynamicResources{}
@@ -251,12 +269,10 @@ func (pl *dynamicResources) EventsToRegister() []framework.ClusterEventWithHint
}
events := []framework.ClusterEventWithHint{
// Allocation is tracked in ResourceClaims, so any changes may make the pods schedulable.
{Event: framework.ClusterEvent{Resource: framework.ResourceClaim, ActionType: framework.Add | framework.Update}},
{Event: framework.ClusterEvent{Resource: framework.ResourceClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClaimChange},
// When a driver has provided additional information, a pod waiting for that information
// may be schedulable.
// TODO (#113702): can we change this so that such an event does not trigger *all* pods?
// Yes: https://github.com/kubernetes/kubernetes/blob/abcbaed0784baf5ed2382aae9705a8918f2daa18/pkg/scheduler/eventhandlers.go#L70
{Event: framework.ClusterEvent{Resource: framework.PodSchedulingContext, ActionType: framework.Add | framework.Update}},
{Event: framework.ClusterEvent{Resource: framework.PodSchedulingContext, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPodSchedulingContextChange},
// A resource might depend on node labels for topology filtering.
// A new or updated node may make pods schedulable.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
@@ -264,13 +280,237 @@ func (pl *dynamicResources) EventsToRegister() []framework.ClusterEventWithHint
return events
}
// PreEnqueue checks if there are known reasons why a pod currently cannot be
// scheduled. When this fails, one of the registered events can trigger another
// attempt.
func (pl *dynamicResources) PreEnqueue(ctx context.Context, pod *v1.Pod) (status *framework.Status) {
if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
return statusUnschedulable(klog.FromContext(ctx), err.Error())
}
return nil
}
// isSchedulableAfterClaimChange is invoked for all claim events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable. It errs on the side of letting a pod scheduling attempt
// happen.
func (pl *dynamicResources) isSchedulableAfterClaimChange(pod *v1.Pod, oldObj, newObj interface{}) framework.QueueingHint {
if newObj == nil {
// Deletes don't make a pod schedulable.
return framework.QueueSkip
}
_, modifiedClaim, err := schedutil.As[*resourcev1alpha2.ResourceClaim](nil, newObj)
if err != nil {
// Shouldn't happen.
pl.logger.Error(err, "unexpected new object in isSchedulableAfterClaimChange")
return framework.QueueAfterBackoff
}
usesClaim := false
if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
if claim.UID == modifiedClaim.UID {
usesClaim = true
}
}); err != nil {
// This is not an unexpected error: we know that
// foreachPodResourceClaim only returns errors for "not
// schedulable".
pl.logger.V(4).Info("pod is not schedulable", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "reason", err.Error())
return framework.QueueSkip
}
if !usesClaim {
// This was not the claim the pod was waiting for.
pl.logger.V(6).Info("unrelated claim got modified", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.QueueSkip
}
if oldObj == nil {
pl.logger.V(4).Info("claim for pod got created", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.QueueImmediately
}
// Modifications may or may not be relevant. If the entire
// status is as before, then something else must have changed
// and we don't care. What happens in practice is that the
// resource driver adds the finalizer.
originalClaim, ok := oldObj.(*resourcev1alpha2.ResourceClaim)
if !ok {
// Shouldn't happen.
pl.logger.Error(nil, "unexpected old object in isSchedulableAfterClaimAddOrUpdate", "obj", oldObj)
return framework.QueueAfterBackoff
}
if apiequality.Semantic.DeepEqual(&originalClaim.Status, &modifiedClaim.Status) {
if loggerV := pl.logger.V(7); loggerV.Enabled() {
// Log more information.
loggerV.Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "diff", cmp.Diff(originalClaim, modifiedClaim))
} else {
pl.logger.V(6).Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
}
return framework.QueueSkip
}
pl.logger.V(4).Info("status of claim for pod got updated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.QueueImmediately
}
// isSchedulableAfterPodSchedulingContextChange is invoked for all
// PodSchedulingContext events reported by an informer. It checks whether that
// change made a previously unschedulable pod schedulable (updated) or whether
// a new attempt is needed to re-create the object (deleted). It errs on the side of
// letting a pod scheduling attempt happen.
func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(pod *v1.Pod, oldObj, newObj interface{}) framework.QueueingHint {
// Deleted? That can happen because we ourselves delete the PodSchedulingContext while
// working on the pod. This can be ignored.
if oldObj != nil && newObj == nil {
pl.logger.V(4).Info("PodSchedulingContext got deleted")
return framework.QueueSkip
}
oldPodScheduling, newPodScheduling, err := schedutil.As[*resourcev1alpha2.PodSchedulingContext](oldObj, newObj)
if err != nil {
// Shouldn't happen.
pl.logger.Error(nil, "isSchedulableAfterPodSchedulingChange")
return framework.QueueAfterBackoff
}
podScheduling := newPodScheduling // Never nil because deletes are handled above.
if podScheduling.Name != pod.Name || podScheduling.Namespace != pod.Namespace {
pl.logger.V(7).Info("PodSchedulingContext for unrelated pod got modified", "pod", klog.KObj(pod), "podScheduling", klog.KObj(podScheduling))
return framework.QueueSkip
}
// If the drivers have provided information about all
// unallocated claims with delayed allocation, then the next
// scheduling attempt is able to pick a node, so we let it run
// immediately if this occurred for the first time, otherwise
// we allow backoff.
pendingDelayedClaims := 0
if err := pl.foreachPodResourceClaim(pod, func(podResourceName string, claim *resourcev1alpha2.ResourceClaim) {
if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
claim.Status.Allocation == nil &&
!podSchedulingHasClaimInfo(podScheduling, podResourceName) {
pendingDelayedClaims++
}
}); err != nil {
// This is not an unexpected error: we know that
// foreachPodResourceClaim only returns errors for "not
// schedulable".
pl.logger.V(4).Info("pod is not schedulable, keep waiting", "pod", klog.KObj(pod), "reason", err.Error())
return framework.QueueSkip
}
// Some driver responses missing?
if pendingDelayedClaims > 0 {
// We could start a pod scheduling attempt to refresh the
// potential nodes list. But pod scheduling attempts are
// expensive and doing them too often causes the pod to enter
// backoff. Let's wait instead for all drivers to reply.
if loggerV := pl.logger.V(6); loggerV.Enabled() {
loggerV.Info("PodSchedulingContext with missing resource claim information, keep waiting", "pod", klog.KObj(pod), "podSchedulingDiff", cmp.Diff(oldPodScheduling, podScheduling))
} else {
pl.logger.V(5).Info("PodSchedulingContext with missing resource claim information, keep waiting", "pod", klog.KObj(pod))
}
return framework.QueueSkip
}
if oldPodScheduling == nil /* create */ ||
len(oldPodScheduling.Status.ResourceClaims) < len(podScheduling.Status.ResourceClaims) /* new information and not incomplete (checked above) */ {
// This definitely is new information for the scheduler. Try again immediately.
pl.logger.V(4).Info("PodSchedulingContext for pod has all required information, schedule immediately", "pod", klog.KObj(pod))
return framework.QueueImmediately
}
// The other situation where the scheduler needs to do
// something immediately is when the selected node doesn't
// work: waiting in the backoff queue only helps if resources on the
// selected node eventually become available again. It's
// much more likely, in particular when trying to fill up the
// cluster, that the choice simply didn't work out. The risk
// here is that in a situation where the cluster really is
// full, backoff won't be used because the scheduler keeps
// trying different nodes. This should not happen when it has
// full knowledge about resource availability (=
// PodSchedulingContext.*.UnsuitableNodes is complete) but may happen
// when it doesn't (= PodSchedulingContext.*.UnsuitableNodes had to be
// truncated).
//
// Truncation only happens for very large clusters and then may slow
// down scheduling, but should not break it completely. This is
// acceptable while DRA is alpha and will be investigated further
// before moving DRA to beta.
if podScheduling.Spec.SelectedNode != "" {
for _, claimStatus := range podScheduling.Status.ResourceClaims {
if sliceContains(claimStatus.UnsuitableNodes, podScheduling.Spec.SelectedNode) {
pl.logger.V(5).Info("PodSchedulingContext has unsuitable selected node, schedule immediately", "pod", klog.KObj(pod), "selectedNode", podScheduling.Spec.SelectedNode, "podResourceName", claimStatus.Name)
return framework.QueueImmediately
}
}
}
// Update with only the spec modified?
if oldPodScheduling != nil &&
!apiequality.Semantic.DeepEqual(&oldPodScheduling.Spec, &podScheduling.Spec) &&
apiequality.Semantic.DeepEqual(&oldPodScheduling.Status, &podScheduling.Status) {
pl.logger.V(5).Info("PodSchedulingContext has only the scheduler spec changes, ignore the update", "pod", klog.KObj(pod))
return framework.QueueSkip
}
// Once we get here, all changes which are known to require special responses
// have been checked for. Whatever the change was, we don't know exactly how
// to handle it and thus return QueueAfterBackoff. This will cause the
// scheduler to treat the event as if no event hint callback had been provided.
// Developers who want to investigate this can enable a diff at log level 6.
if loggerV := pl.logger.V(6); loggerV.Enabled() {
loggerV.Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod), "podSchedulingDiff", cmp.Diff(oldPodScheduling, podScheduling))
} else {
pl.logger.V(5).Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod))
}
return framework.QueueAfterBackoff
}
func podSchedulingHasClaimInfo(podScheduling *resourcev1alpha2.PodSchedulingContext, podResourceName string) bool {
for _, claimStatus := range podScheduling.Status.ResourceClaims {
if claimStatus.Name == podResourceName {
return true
}
}
return false
}
func sliceContains(hay []string, needle string) bool {
for _, item := range hay {
if item == needle {
return true
}
}
return false
}
// podResourceClaims returns the ResourceClaims for all entries in pod.Spec.ResourceClaims.
func (pl *dynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourcev1alpha2.ResourceClaim, error) {
claims := make([]*resourcev1alpha2.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
// We store the pointer as returned by the lister. The
// assumption is that if a claim gets modified while our code
// runs, the cache will store a new pointer, not mutate the
// existing object that we point to here.
claims = append(claims, claim)
}); err != nil {
return nil, err
}
return claims, nil
}
// foreachPodResourceClaim checks that each ResourceClaim for the pod exists.
// It calls an optional handler for those claims that it finds.
func (pl *dynamicResources) foreachPodResourceClaim(pod *v1.Pod, cb func(podResourceName string, claim *resourcev1alpha2.ResourceClaim)) error {
for _, resource := range pod.Spec.ResourceClaims {
claimName, mustCheckOwner, err := resourceclaim.Name(pod, &resource)
if err != nil {
return nil, err
return err
}
// The claim name might be nil if no underlying resource claim
// was generated for the referenced claim. There are valid use
@@ -280,25 +520,23 @@ func (pl *dynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourcev1alpha2.
}
claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
if err != nil {
return nil, err
return err
}
if claim.DeletionTimestamp != nil {
return nil, fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
return fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
}
if mustCheckOwner {
if err := resourceclaim.IsForPod(pod, claim); err != nil {
return nil, err
return err
}
}
// We store the pointer as returned by the lister. The
// assumption is that if a claim gets modified while our code
// runs, the cache will store a new pointer, not mutate the
// existing object that we point to here.
claims = append(claims, claim)
if cb != nil {
cb(resource.Name, claim)
}
}
return claims, nil
return nil
}
// PreFilter invoked at the prefilter extension point to check if pod has all