dra scheduler: fall back to SSA for PodSchedulingContext updates

During scheduler_perf testing, roughly 10% of the PodSchedulingContext update
operations failed with a conflict error. Using server-side apply (SSA) would
avoid those conflicts, but performance measurements showed that it causes a
considerable slowdown, primarily because encoding as JSON is slower than
protobuf, but also because server-side processing is more expensive.

Therefore a normal update is tried first, and SSA is only used when that update
hits a conflict. Falling back to SSA in that case, instead of giving up
outright, is better because it avoids another scheduling attempt.
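
Reduced to its essentials, the pattern looks roughly like the sketch below. It
is illustrative only, not the scheduler code itself: the helper name
publishSelectedNode, the plain string parameter, and the way the clientset and
the cached object are passed in are assumptions for the example; only the
Update / IsConflict / Apply sequence mirrors what the diff below does.

package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	resourcev1alpha2apply "k8s.io/client-go/applyconfigurations/resource/v1alpha2"
	"k8s.io/client-go/kubernetes"
)

// publishSelectedNode is a hypothetical helper, not the actual scheduler code.
// It first tries a plain Update of an existing PodSchedulingContext (fast path,
// protobuf encoding) and only falls back to server-side apply when the Update
// returns a conflict.
func publishSelectedNode(ctx context.Context, clientset kubernetes.Interface, pod *v1.Pod, current *resourcev1alpha2.PodSchedulingContext, selectedNode string) error {
	// Fast path: plain Update based on the cached object.
	updated := current.DeepCopy()
	updated.Spec.SelectedNode = selectedNode
	_, err := clientset.ResourceV1alpha2().PodSchedulingContexts(updated.Namespace).Update(ctx, updated, metav1.UpdateOptions{})
	if !apierrors.IsConflict(err) {
		// Success, or an error that SSA would not help with.
		return err
	}

	// Slow path: server-side apply. The apply configuration must express the
	// "fully specified intent", so unchanged fields such as PotentialNodes are
	// copied from the last known object; omitting them would clear them.
	spec := resourcev1alpha2apply.PodSchedulingContextSpec().
		WithSelectedNode(selectedNode).
		WithPotentialNodes(current.Spec.PotentialNodes...)
	applyCfg := resourcev1alpha2apply.PodSchedulingContext(pod.Name, pod.Namespace).WithSpec(spec)
	_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Apply(ctx, applyCfg,
		metav1.ApplyOptions{FieldManager: "kube-scheduler", Force: true})
	return err
}

Force: true lets the apply win over the conflicting writer; as the comment in
the diff notes, this is safe because only the scheduler updates these fields.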
Patrick Ohly
2023-09-06 19:44:29 +02:00
parent 14ed7e8609
commit 7cac1dcf67
3 changed files with 229 additions and 5 deletions

@@ -33,6 +33,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/util/sets"
+	resourcev1alpha2apply "k8s.io/client-go/applyconfigurations/resource/v1alpha2"
 	"k8s.io/client-go/kubernetes"
 	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
 	"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
@@ -187,6 +188,41 @@ func (p *podSchedulingState) publish(ctx context.Context, pod *v1.Pod, clientset
 			logger.V(5).Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
 		}
 		_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Update(ctx, schedulingCtx, metav1.UpdateOptions{})
+		if apierrors.IsConflict(err) {
+			// We don't use SSA by default for performance reasons
+			// (https://github.com/kubernetes/kubernetes/issues/113700#issuecomment-1698563918)
+			// because most of the time an Update doesn't encounter
+			// a conflict and is faster.
+			//
+			// We could return an error here and rely on
+			// backoff+retry, but scheduling attempts are expensive
+			// and the backoff delay would cause a (small)
+			// slowdown. Therefore we fall back to SSA here if needed.
+			//
+			// Using SSA instead of Get+Update has the advantage that
+			// there is no delay for the Get. SSA is safe because only
+			// the scheduler updates these fields.
+			spec := resourcev1alpha2apply.PodSchedulingContextSpec()
+			spec.SelectedNode = p.selectedNode
+			if p.potentialNodes != nil {
+				spec.PotentialNodes = *p.potentialNodes
+			} else {
+				// Unchanged. Has to be set because the object that we send
+				// must represent the "fully specified intent". Not sending
+				// the list would clear it.
+				spec.PotentialNodes = p.schedulingCtx.Spec.PotentialNodes
+			}
+			schedulingCtxApply := resourcev1alpha2apply.PodSchedulingContext(pod.Name, pod.Namespace).WithSpec(spec)
+
+			if loggerV := logger.V(6); loggerV.Enabled() {
+				// At a high enough log level, dump the entire object.
+				loggerV.Info("Patching PodSchedulingContext", "podSchedulingCtx", klog.KObj(pod), "podSchedulingCtxApply", klog.Format(schedulingCtxApply))
+			} else {
+				logger.V(5).Info("Patching PodSchedulingContext", "podSchedulingCtx", klog.KObj(pod))
+			}
+			_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Apply(ctx, schedulingCtxApply, metav1.ApplyOptions{FieldManager: "kube-scheduler", Force: true})
+		}
+
 	} else {
 		// Create it.
 		schedulingCtx := &resourcev1alpha2.PodSchedulingContext{