scheduler/dra: reduce pod scheduling latency

This is a combination of two related enhancements:
- By implementing a PreEnqueue check, the initial scheduling attempt for a pod
  with a claim template is skipped as long as the generated claim does not
  exist yet (see the sketch below).
- By implementing cluster event checks, only pods for which something relevant
  changed get re-queued, and they get scheduled immediately, without backoff
  delay.
Author: Patrick Ohly
Date: 2023-02-13 09:34:11 +01:00
Parent: e01db32573
Commit: 6f1a29520f
2 changed files with 516 additions and 31 deletions
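
To make the PreEnqueue idea concrete, here is a minimal, self-contained sketch of such a check. It is an illustration, not the code added by this commit: the dynamicResourcesSketch type, its claimLister field, and the "<pod name>-<pod claim name>" naming convention are assumptions taken from the test fixtures below (e.g. the claim "my-pod-my-resource" for pod "my-pod").

package dynamicresources

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// dynamicResourcesSketch stands in for the real plugin type; only a
// ResourceClaim lister is needed for this example.
type dynamicResourcesSketch struct {
	claimLister resourcev1alpha2listers.ResourceClaimLister
}

// PreEnqueue keeps a pod out of the active scheduling queue until every
// ResourceClaim it references exists and is not being deleted.
func (pl *dynamicResourcesSketch) PreEnqueue(ctx context.Context, pod *v1.Pod) *framework.Status {
	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName := podClaim.Source.ResourceClaimName
		if claimName == nil {
			// The claim comes from a template. Assume the generated claim is
			// named "<pod name>-<pod claim name>", as in the test fixtures.
			generated := pod.Name + "-" + podClaim.Name
			claimName = &generated
		}
		claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		switch {
		case apierrors.IsNotFound(err):
			return framework.NewStatus(framework.UnschedulableAndUnresolvable,
				fmt.Sprintf("pod %q: ResourceClaim not created yet", pod.Namespace+"/"+pod.Name))
		case err != nil:
			return framework.AsStatus(err)
		case claim.DeletionTimestamp != nil:
			return framework.NewStatus(framework.UnschedulableAndUnresolvable,
				fmt.Sprintf("resourceclaim %q is being deleted", claim.Name))
		}
	}
	return nil
}

A pod that fails this check never reaches PreFilter, so the wasted initial scheduling attempt described above simply does not happen; once the claim shows up, the event checks tested below re-queue the pod.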


@@ -23,6 +23,7 @@ import (
"sort"
"sync"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/stretchr/testify/assert"
@@ -37,6 +38,7 @@ import (
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
cgotesting "k8s.io/client-go/testing"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2/ktesting"
_ "k8s.io/klog/v2/ktesting/init"
"k8s.io/kubernetes/pkg/scheduler/framework"
@@ -103,6 +105,7 @@ var (
AllocationMode(resourcev1alpha2.AllocationModeImmediate).
Obj()
pendingDelayedClaim = st.FromResourceClaim(claim).
OwnerReference(podName, podUID, podKind).
AllocationMode(resourcev1alpha2.AllocationModeWaitForFirstConsumer).
Obj()
pendingDelayedClaim2 = st.FromResourceClaim(pendingDelayedClaim).
@@ -117,7 +120,6 @@ var (
ReservedFor(resourcev1alpha2.ResourceClaimConsumerReference{Resource: "pods", Name: podName, UID: types.UID(podUID)}).
Obj()
allocatedClaim = st.FromResourceClaim(pendingDelayedClaim).
OwnerReference(podName, podUID, podKind).
Allocation(&resourcev1alpha2.AllocationResult{}).
Obj()
allocatedDelayedClaimWithWrongTopology = st.FromResourceClaim(allocatedClaim).
@@ -183,6 +185,7 @@ func (p perNodeResult) forNode(nodeName string) result {
}
type want struct {
preenqueue result
preFilterResult *framework.PreFilterResult
prefilter result
filter perNodeResult
@@ -268,11 +271,34 @@ func TestPlugin(t *testing.T) {
pod: podWithClaimTemplate, // status not set
claims: []*resourcev1alpha2.ResourceClaim{allocatedClaim, otherClaim},
want: want{
- prefilter: result{
+ preenqueue: result{
status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `pod "default/my-pod": ResourceClaim not created yet`),
},
postfilter: result{
status: framework.NewStatus(framework.Unschedulable, `no new claims to deallocate`),
},
},
},
"deleted-claim": {
pod: podWithClaimTemplateInStatus,
claims: func() []*resourcev1alpha2.ResourceClaim {
claim := allocatedClaim.DeepCopy()
claim.DeletionTimestamp = &metav1.Time{Time: time.Now()}
return []*resourcev1alpha2.ResourceClaim{claim}
}(),
want: want{
preenqueue: result{
status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `resourceclaim "my-pod-my-resource" is being deleted`),
},
},
},
"wrong-claim": {
pod: podWithClaimTemplateInStatus,
claims: func() []*resourcev1alpha2.ResourceClaim {
claim := allocatedClaim.DeepCopy()
claim.OwnerReferences[0].UID += "123"
return []*resourcev1alpha2.ResourceClaim{claim}
}(),
want: want{
preenqueue: result{
status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `ResourceClaim default/my-pod-my-resource was not created for pod default/my-pod (pod is not owner)`),
},
},
},
@@ -506,8 +532,16 @@ func TestPlugin(t *testing.T) {
}
testCtx := setup(t, nodes, tc.claims, tc.classes, tc.schedulings)
testCtx.p.enabled = !tc.disable
initialObjects := testCtx.listAll(t)
status := testCtx.p.PreEnqueue(testCtx.ctx, tc.pod)
t.Run("PreEnqueue", func(t *testing.T) {
testCtx.verify(t, tc.want.preenqueue, initialObjects, nil, status)
})
if !status.IsSuccess() {
return
}
result, status := testCtx.p.PreFilter(testCtx.ctx, testCtx.state, tc.pod)
t.Run("prefilter", func(t *testing.T) {
assert.Equal(t, tc.want.preFilterResult, result)
@@ -595,11 +629,12 @@ func TestPlugin(t *testing.T) {
}
type testContext struct {
- ctx context.Context
- client *fake.Clientset
- p *dynamicResources
- nodeInfos []*framework.NodeInfo
- state *framework.CycleState
+ ctx context.Context
+ client *fake.Clientset
+ informerFactory informers.SharedInformerFactory
+ p *dynamicResources
+ nodeInfos []*framework.NodeInfo
+ state *framework.CycleState
}
func (tc *testContext) verify(t *testing.T, expected result, initialObjects []metav1.Object, result interface{}, status *framework.Status) {
@@ -727,7 +762,7 @@ func setup(t *testing.T, nodes []*v1.Node, claims []*resourcev1alpha2.ResourceCl
t.Helper()
tc := &testContext{}
- _, ctx := ktesting.NewTestContext(t)
+ logger, ctx := ktesting.NewTestContext(t)
ctx, cancel := context.WithCancel(ctx)
t.Cleanup(cancel)
tc.ctx = ctx
@@ -736,18 +771,18 @@ func setup(t *testing.T, nodes []*v1.Node, claims []*resourcev1alpha2.ResourceCl
reactor := createReactor(tc.client.Tracker())
tc.client.PrependReactor("*", "*", reactor)
- informerFactory := informers.NewSharedInformerFactory(tc.client, 0)
+ tc.informerFactory = informers.NewSharedInformerFactory(tc.client, 0)
opts := []runtime.Option{
runtime.WithClientSet(tc.client),
- runtime.WithInformerFactory(informerFactory),
+ runtime.WithInformerFactory(tc.informerFactory),
}
fh, err := runtime.NewFramework(ctx, nil, nil, opts...)
if err != nil {
t.Fatal(err)
}
- pl, err := New(nil, fh, feature.Features{EnableDynamicResourceAllocation: true})
+ pl, err := NewWithLogger(logger, nil, fh, feature.Features{EnableDynamicResourceAllocation: true})
if err != nil {
t.Fatal(err)
}
@@ -768,15 +803,15 @@ func setup(t *testing.T, nodes []*v1.Node, claims []*resourcev1alpha2.ResourceCl
require.NoError(t, err, "create pod scheduling")
}
- informerFactory.Start(tc.ctx.Done())
+ tc.informerFactory.Start(tc.ctx.Done())
t.Cleanup(func() {
// Need to cancel before waiting for the shutdown.
cancel()
// Now we can wait for all goroutines to stop.
- informerFactory.Shutdown()
+ tc.informerFactory.Shutdown()
})
- informerFactory.WaitForCacheSync(tc.ctx.Done())
+ tc.informerFactory.WaitForCacheSync(tc.ctx.Done())
for _, node := range nodes {
nodeInfo := framework.NewNodeInfo()
@@ -849,3 +884,215 @@ func createReactor(tracker cgotesting.ObjectTracker) func(action cgotesting.Acti
return false, nil, nil
}
}
func TestClaimChange(t *testing.T) {
testcases := map[string]struct {
pod *v1.Pod
claims []*resourcev1alpha2.ResourceClaim
oldObj, newObj interface{}
expectedHint framework.QueueingHint
}{
"skip-deletes": {
pod: podWithClaimTemplate,
oldObj: allocatedClaim,
newObj: nil,
expectedHint: framework.QueueSkip,
},
"backoff-wrong-new-object": {
pod: podWithClaimTemplate,
newObj: "not-a-claim",
expectedHint: framework.QueueAfterBackoff,
},
"skip-wrong-claim": {
pod: podWithClaimTemplate,
newObj: func() *resourcev1alpha2.ResourceClaim {
claim := allocatedClaim.DeepCopy()
claim.OwnerReferences[0].UID += "123"
return claim
}(),
expectedHint: framework.QueueSkip,
},
"skip-unrelated-claim": {
pod: podWithClaimTemplate,
claims: []*resourcev1alpha2.ResourceClaim{allocatedClaim},
newObj: func() *resourcev1alpha2.ResourceClaim {
claim := allocatedClaim.DeepCopy()
claim.Name += "-foo"
claim.UID += "123"
return claim
}(),
expectedHint: framework.QueueSkip,
},
"queue-on-add": {
pod: podWithClaimName,
newObj: pendingImmediateClaim,
expectedHint: framework.QueueImmediately,
},
"backoff-wrong-old-object": {
pod: podWithClaimName,
oldObj: "not-a-claim",
newObj: pendingImmediateClaim,
expectedHint: framework.QueueAfterBackoff,
},
"skip-adding-finalizer": {
pod: podWithClaimName,
claims: []*resourcev1alpha2.ResourceClaim{pendingImmediateClaim},
oldObj: pendingImmediateClaim,
newObj: func() *resourcev1alpha2.ResourceClaim {
claim := pendingImmediateClaim.DeepCopy()
claim.Finalizers = append(claim.Finalizers, "foo")
return claim
}(),
expectedHint: framework.QueueSkip,
},
"queue-on-status-change": {
pod: podWithClaimName,
claims: []*resourcev1alpha2.ResourceClaim{pendingImmediateClaim},
oldObj: pendingImmediateClaim,
newObj: func() *resourcev1alpha2.ResourceClaim {
claim := pendingImmediateClaim.DeepCopy()
claim.Status.Allocation = &resourcev1alpha2.AllocationResult{}
return claim
}(),
expectedHint: framework.QueueImmediately,
},
}
for name, tc := range testcases {
t.Run(name, func(t *testing.T) {
testCtx := setup(t, nil, tc.claims, nil, nil)
if claim, ok := tc.newObj.(*resourcev1alpha2.ResourceClaim); ok {
// Update the informer because the lister gets called and must have the claim.
store := testCtx.informerFactory.Resource().V1alpha2().ResourceClaims().Informer().GetStore()
if tc.oldObj == nil {
require.NoError(t, store.Add(claim))
} else {
require.NoError(t, store.Update(claim))
}
}
actualHint := testCtx.p.isSchedulableAfterClaimChange(tc.pod, tc.oldObj, tc.newObj)
require.Equal(t, tc.expectedHint, actualHint)
})
}
}
func TestPodSchedulingContextChange(t *testing.T) {
testcases := map[string]struct {
pod *v1.Pod
schedulings []*resourcev1alpha2.PodSchedulingContext
claims []*resourcev1alpha2.ResourceClaim
oldObj, newObj interface{}
expectedHint framework.QueueingHint
}{
"skip-deleted": {
pod: podWithClaimTemplate,
oldObj: scheduling,
expectedHint: framework.QueueSkip,
},
"skip-missed-deleted": {
pod: podWithClaimTemplate,
oldObj: cache.DeletedFinalStateUnknown{
Obj: scheduling,
},
expectedHint: framework.QueueSkip,
},
"backoff-wrong-old-object": {
pod: podWithClaimTemplate,
oldObj: "not-a-scheduling-context",
newObj: scheduling,
expectedHint: framework.QueueAfterBackoff,
},
"backoff-missed-wrong-old-object": {
pod: podWithClaimTemplate,
oldObj: cache.DeletedFinalStateUnknown{
Obj: "not-a-scheduling-context",
},
newObj: scheduling,
expectedHint: framework.QueueAfterBackoff,
},
"skip-unrelated-object": {
pod: podWithClaimTemplate,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
newObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := scheduling.DeepCopy()
scheduling.Name += "-foo"
return scheduling
}(),
expectedHint: framework.QueueSkip,
},
"backoff-wrong-new-object": {
pod: podWithClaimTemplate,
oldObj: scheduling,
newObj: "not-a-scheduling-context",
expectedHint: framework.QueueAfterBackoff,
},
"skip-missing-claim": {
pod: podWithClaimTemplate,
oldObj: scheduling,
newObj: schedulingInfo,
expectedHint: framework.QueueSkip,
},
"skip-missing-infos": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: scheduling,
newObj: scheduling,
expectedHint: framework.QueueSkip,
},
"queue-new-infos": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: scheduling,
newObj: schedulingInfo,
expectedHint: framework.QueueImmediately,
},
"queue-bad-selected-node": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := schedulingInfo.DeepCopy()
scheduling.Spec.SelectedNode = workerNode.Name
return scheduling
}(),
newObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := schedulingInfo.DeepCopy()
scheduling.Spec.SelectedNode = workerNode.Name
scheduling.Status.ResourceClaims[0].UnsuitableNodes = append(scheduling.Status.ResourceClaims[0].UnsuitableNodes, scheduling.Spec.SelectedNode)
return scheduling
}(),
expectedHint: framework.QueueImmediately,
},
"skip-spec-changes": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: schedulingInfo,
newObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := schedulingInfo.DeepCopy()
scheduling.Spec.SelectedNode = workerNode.Name
return scheduling
}(),
expectedHint: framework.QueueSkip,
},
"backoff-other-changes": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: schedulingInfo,
newObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := schedulingInfo.DeepCopy()
scheduling.Finalizers = append(scheduling.Finalizers, "foo")
return scheduling
}(),
expectedHint: framework.QueueAfterBackoff,
},
}
for name, tc := range testcases {
tc := tc
t.Run(name, func(t *testing.T) {
t.Parallel()
testCtx := setup(t, nil, tc.claims, nil, tc.schedulings)
actualHint := testCtx.p.isSchedulableAfterPodSchedulingContextChange(tc.pod, tc.oldObj, tc.newObj)
require.Equal(t, tc.expectedHint, actualHint)
})
}
}
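
The queueing decisions exercised by TestClaimChange can be summarized in a short, hedged sketch. This is not the implementation from this commit: podUsesClaim is an illustrative helper, and the "<pod name>-<pod claim name>" naming assumption again comes from the test fixtures; only the hint values (QueueSkip, QueueAfterBackoff, QueueImmediately) are taken directly from the tests above.

package dynamicresources

import (
	v1 "k8s.io/api/core/v1"
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// isSchedulableAfterClaimChange decides whether a ResourceClaim event can make
// the given pod schedulable: skip irrelevant events, back off on unexpected
// objects, and queue immediately when the claim the pod is waiting for gets
// created or its status changes.
func isSchedulableAfterClaimChange(pod *v1.Pod, oldObj, newObj interface{}) framework.QueueingHint {
	if newObj == nil {
		// A deleted claim never makes the pod schedulable.
		return framework.QueueSkip
	}
	claim, ok := newObj.(*resourcev1alpha2.ResourceClaim)
	if !ok {
		// Unexpected object type: fall back to the normal backoff path.
		return framework.QueueAfterBackoff
	}
	if !podUsesClaim(pod, claim) {
		// The claim is unrelated to this pod or belongs to a different pod
		// instance (wrong owner UID).
		return framework.QueueSkip
	}
	if oldObj == nil {
		// The claim the pod has been waiting for was created: retry right away.
		return framework.QueueImmediately
	}
	oldClaim, ok := oldObj.(*resourcev1alpha2.ResourceClaim)
	if !ok {
		return framework.QueueAfterBackoff
	}
	if apiequality.Semantic.DeepEqual(oldClaim.Status, claim.Status) {
		// Metadata-only changes (for example, a new finalizer) are not
		// relevant for scheduling.
		return framework.QueueSkip
	}
	// The status changed, e.g. an allocation was added: retry immediately.
	return framework.QueueImmediately
}

// podUsesClaim is an illustrative helper: the pod must either reference the
// claim by name or, for template-generated claims, the claim must have the
// expected name and be owned by this pod instance.
func podUsesClaim(pod *v1.Pod, claim *resourcev1alpha2.ResourceClaim) bool {
	for _, podClaim := range pod.Spec.ResourceClaims {
		if podClaim.Source.ResourceClaimName != nil {
			if *podClaim.Source.ResourceClaimName == claim.Name {
				return true
			}
			continue
		}
		if claim.Name != pod.Name+"-"+podClaim.Name {
			continue
		}
		for _, owner := range claim.OwnerReferences {
			if owner.UID == pod.UID {
				return true
			}
		}
	}
	return false
}

Returning QueueSkip for irrelevant events and QueueImmediately, rather than QueueAfterBackoff, for the events a pod is actually waiting for is what removes the re-scheduling delay described in the commit message.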