scheduler/dra: reduce pod scheduling latency

This is a combination of two related enhancements:
- By implementing a PreEnqueue check, the initial scheduling attempt for a pod
  with a claim template is skipped as long as the generated claim does not
  exist yet (see the sketch below).
- By implementing cluster event checks, only pods for which something relevant
  changed get re-queued, and they get scheduled immediately, without backoff
  delay.
Author: Patrick Ohly
Date: 2023-02-13 09:34:11 +01:00
Parent: e01db32573
Commit: 6f1a29520f
2 changed files with 516 additions and 31 deletions
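
To make the PreEnqueue idea concrete, here is a minimal, self-contained sketch of such a check. It is an illustration, not the code added by this commit: the dynamicResourcesSketch type, its claimLister field, and the "<pod name>-<pod claim name>" naming convention are assumptions taken from the test fixtures below (e.g. the claim "my-pod-my-resource" for pod "my-pod").

package dynamicresources

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// dynamicResourcesSketch stands in for the real plugin type; only a
// ResourceClaim lister is needed for this example.
type dynamicResourcesSketch struct {
	claimLister resourcev1alpha2listers.ResourceClaimLister
}

// PreEnqueue keeps a pod out of the active scheduling queue until every
// ResourceClaim it references exists and is not being deleted.
func (pl *dynamicResourcesSketch) PreEnqueue(ctx context.Context, pod *v1.Pod) *framework.Status {
	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName := podClaim.Source.ResourceClaimName
		if claimName == nil {
			// The claim comes from a template. Assume the generated claim is
			// named "<pod name>-<pod claim name>", as in the test fixtures.
			generated := pod.Name + "-" + podClaim.Name
			claimName = &generated
		}
		claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		switch {
		case apierrors.IsNotFound(err):
			return framework.NewStatus(framework.UnschedulableAndUnresolvable,
				fmt.Sprintf("pod %q: ResourceClaim not created yet", pod.Namespace+"/"+pod.Name))
		case err != nil:
			return framework.AsStatus(err)
		case claim.DeletionTimestamp != nil:
			return framework.NewStatus(framework.UnschedulableAndUnresolvable,
				fmt.Sprintf("resourceclaim %q is being deleted", claim.Name))
		}
	}
	return nil
}

A pod that fails this check never reaches PreFilter, so the wasted initial scheduling attempt described above simply does not happen; once the claim shows up, the event checks tested below re-queue the pod.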


@@ -23,6 +23,7 @@ import (
"sort"
"sync"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/stretchr/testify/assert"
@@ -37,6 +38,7 @@ import (
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
cgotesting "k8s.io/client-go/testing"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2/ktesting"
_ "k8s.io/klog/v2/ktesting/init"
"k8s.io/kubernetes/pkg/scheduler/framework"
@@ -103,6 +105,7 @@ var (
AllocationMode(resourcev1alpha2.AllocationModeImmediate).
Obj()
pendingDelayedClaim = st.FromResourceClaim(claim).
OwnerReference(podName, podUID, podKind).
AllocationMode(resourcev1alpha2.AllocationModeWaitForFirstConsumer).
Obj()
pendingDelayedClaim2 = st.FromResourceClaim(pendingDelayedClaim).
@@ -117,7 +120,6 @@ var (
ReservedFor(resourcev1alpha2.ResourceClaimConsumerReference{Resource: "pods", Name: podName, UID: types.UID(podUID)}).
Obj()
allocatedClaim = st.FromResourceClaim(pendingDelayedClaim).
OwnerReference(podName, podUID, podKind).
Allocation(&resourcev1alpha2.AllocationResult{}).
Obj()
allocatedDelayedClaimWithWrongTopology = st.FromResourceClaim(allocatedClaim).
@@ -183,6 +185,7 @@ func (p perNodeResult) forNode(nodeName string) result {
}
type want struct {
preenqueue result
preFilterResult *framework.PreFilterResult
prefilter result
filter perNodeResult
@@ -268,11 +271,34 @@ func TestPlugin(t *testing.T) {
pod: podWithClaimTemplate, // status not set
claims: []*resourcev1alpha2.ResourceClaim{allocatedClaim, otherClaim},
want: want{
- prefilter: result{
+ preenqueue: result{
status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `pod "default/my-pod": ResourceClaim not created yet`),
},
postfilter: result{
status: framework.NewStatus(framework.Unschedulable, `no new claims to deallocate`),
},
},
},
"deleted-claim": {
pod: podWithClaimTemplateInStatus,
claims: func() []*resourcev1alpha2.ResourceClaim {
claim := allocatedClaim.DeepCopy()
claim.DeletionTimestamp = &metav1.Time{Time: time.Now()}
return []*resourcev1alpha2.ResourceClaim{claim}
}(),
want: want{
preenqueue: result{
status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `resourceclaim "my-pod-my-resource" is being deleted`),
},
},
},
"wrong-claim": {
pod: podWithClaimTemplateInStatus,
claims: func() []*resourcev1alpha2.ResourceClaim {
claim := allocatedClaim.DeepCopy()
claim.OwnerReferences[0].UID += "123"
return []*resourcev1alpha2.ResourceClaim{claim}
}(),
want: want{
preenqueue: result{
status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `ResourceClaim default/my-pod-my-resource was not created for pod default/my-pod (pod is not owner)`),
},
},
},
@@ -506,8 +532,16 @@ func TestPlugin(t *testing.T) {
}
testCtx := setup(t, nodes, tc.claims, tc.classes, tc.schedulings)
testCtx.p.enabled = !tc.disable
initialObjects := testCtx.listAll(t)
status := testCtx.p.PreEnqueue(testCtx.ctx, tc.pod)
t.Run("PreEnqueue", func(t *testing.T) {
testCtx.verify(t, tc.want.preenqueue, initialObjects, nil, status)
})
if !status.IsSuccess() {
return
}
result, status := testCtx.p.PreFilter(testCtx.ctx, testCtx.state, tc.pod)
t.Run("prefilter", func(t *testing.T) {
assert.Equal(t, tc.want.preFilterResult, result)
@@ -595,11 +629,12 @@ func TestPlugin(t *testing.T) {
}
type testContext struct {
- ctx context.Context
- client *fake.Clientset
- p *dynamicResources
- nodeInfos []*framework.NodeInfo
- state *framework.CycleState
+ ctx context.Context
+ client *fake.Clientset
+ informerFactory informers.SharedInformerFactory
+ p *dynamicResources
+ nodeInfos []*framework.NodeInfo
+ state *framework.CycleState
}
func (tc *testContext) verify(t *testing.T, expected result, initialObjects []metav1.Object, result interface{}, status *framework.Status) {
@@ -727,7 +762,7 @@ func setup(t *testing.T, nodes []*v1.Node, claims []*resourcev1alpha2.ResourceCl
t.Helper()
tc := &testContext{}
- _, ctx := ktesting.NewTestContext(t)
+ logger, ctx := ktesting.NewTestContext(t)
ctx, cancel := context.WithCancel(ctx)
t.Cleanup(cancel)
tc.ctx = ctx
@@ -736,18 +771,18 @@ func setup(t *testing.T, nodes []*v1.Node, claims []*resourcev1alpha2.ResourceCl
reactor := createReactor(tc.client.Tracker())
tc.client.PrependReactor("*", "*", reactor)
- informerFactory := informers.NewSharedInformerFactory(tc.client, 0)
+ tc.informerFactory = informers.NewSharedInformerFactory(tc.client, 0)
opts := []runtime.Option{
runtime.WithClientSet(tc.client),
- runtime.WithInformerFactory(informerFactory),
+ runtime.WithInformerFactory(tc.informerFactory),
}
fh, err := runtime.NewFramework(ctx, nil, nil, opts...)
if err != nil {
t.Fatal(err)
}
- pl, err := New(nil, fh, feature.Features{EnableDynamicResourceAllocation: true})
+ pl, err := NewWithLogger(logger, nil, fh, feature.Features{EnableDynamicResourceAllocation: true})
if err != nil {
t.Fatal(err)
}
@@ -768,15 +803,15 @@ func setup(t *testing.T, nodes []*v1.Node, claims []*resourcev1alpha2.ResourceCl
require.NoError(t, err, "create pod scheduling")
}
- informerFactory.Start(tc.ctx.Done())
+ tc.informerFactory.Start(tc.ctx.Done())
t.Cleanup(func() {
// Need to cancel before waiting for the shutdown.
cancel()
// Now we can wait for all goroutines to stop.
- informerFactory.Shutdown()
+ tc.informerFactory.Shutdown()
})
- informerFactory.WaitForCacheSync(tc.ctx.Done())
+ tc.informerFactory.WaitForCacheSync(tc.ctx.Done())
for _, node := range nodes {
nodeInfo := framework.NewNodeInfo()
@@ -849,3 +884,215 @@ func createReactor(tracker cgotesting.ObjectTracker) func(action cgotesting.Acti
return false, nil, nil
}
}
func TestClaimChange(t *testing.T) {
testcases := map[string]struct {
pod *v1.Pod
claims []*resourcev1alpha2.ResourceClaim
oldObj, newObj interface{}
expectedHint framework.QueueingHint
}{
"skip-deletes": {
pod: podWithClaimTemplate,
oldObj: allocatedClaim,
newObj: nil,
expectedHint: framework.QueueSkip,
},
"backoff-wrong-new-object": {
pod: podWithClaimTemplate,
newObj: "not-a-claim",
expectedHint: framework.QueueAfterBackoff,
},
"skip-wrong-claim": {
pod: podWithClaimTemplate,
newObj: func() *resourcev1alpha2.ResourceClaim {
claim := allocatedClaim.DeepCopy()
claim.OwnerReferences[0].UID += "123"
return claim
}(),
expectedHint: framework.QueueSkip,
},
"skip-unrelated-claim": {
pod: podWithClaimTemplate,
claims: []*resourcev1alpha2.ResourceClaim{allocatedClaim},
newObj: func() *resourcev1alpha2.ResourceClaim {
claim := allocatedClaim.DeepCopy()
claim.Name += "-foo"
claim.UID += "123"
return claim
}(),
expectedHint: framework.QueueSkip,
},
"queue-on-add": {
pod: podWithClaimName,
newObj: pendingImmediateClaim,
expectedHint: framework.QueueImmediately,
},
"backoff-wrong-old-object": {
pod: podWithClaimName,
oldObj: "not-a-claim",
newObj: pendingImmediateClaim,
expectedHint: framework.QueueAfterBackoff,
},
"skip-adding-finalizer": {
pod: podWithClaimName,
claims: []*resourcev1alpha2.ResourceClaim{pendingImmediateClaim},
oldObj: pendingImmediateClaim,
newObj: func() *resourcev1alpha2.ResourceClaim {
claim := pendingImmediateClaim.DeepCopy()
claim.Finalizers = append(claim.Finalizers, "foo")
return claim
}(),
expectedHint: framework.QueueSkip,
},
"queue-on-status-change": {
pod: podWithClaimName,
claims: []*resourcev1alpha2.ResourceClaim{pendingImmediateClaim},
oldObj: pendingImmediateClaim,
newObj: func() *resourcev1alpha2.ResourceClaim {
claim := pendingImmediateClaim.DeepCopy()
claim.Status.Allocation = &resourcev1alpha2.AllocationResult{}
return claim
}(),
expectedHint: framework.QueueImmediately,
},
}
for name, tc := range testcases {
t.Run(name, func(t *testing.T) {
testCtx := setup(t, nil, tc.claims, nil, nil)
if claim, ok := tc.newObj.(*resourcev1alpha2.ResourceClaim); ok {
// Update the informer because the lister gets called and must have the claim.
store := testCtx.informerFactory.Resource().V1alpha2().ResourceClaims().Informer().GetStore()
if tc.oldObj == nil {
require.NoError(t, store.Add(claim))
} else {
require.NoError(t, store.Update(claim))
}
}
actualHint := testCtx.p.isSchedulableAfterClaimChange(tc.pod, tc.oldObj, tc.newObj)
require.Equal(t, tc.expectedHint, actualHint)
})
}
}
func TestPodSchedulingContextChange(t *testing.T) {
testcases := map[string]struct {
pod *v1.Pod
schedulings []*resourcev1alpha2.PodSchedulingContext
claims []*resourcev1alpha2.ResourceClaim
oldObj, newObj interface{}
expectedHint framework.QueueingHint
}{
"skip-deleted": {
pod: podWithClaimTemplate,
oldObj: scheduling,
expectedHint: framework.QueueSkip,
},
"skip-missed-deleted": {
pod: podWithClaimTemplate,
oldObj: cache.DeletedFinalStateUnknown{
Obj: scheduling,
},
expectedHint: framework.QueueSkip,
},
"backoff-wrong-old-object": {
pod: podWithClaimTemplate,
oldObj: "not-a-scheduling-context",
newObj: scheduling,
expectedHint: framework.QueueAfterBackoff,
},
"backoff-missed-wrong-old-object": {
pod: podWithClaimTemplate,
oldObj: cache.DeletedFinalStateUnknown{
Obj: "not-a-scheduling-context",
},
newObj: scheduling,
expectedHint: framework.QueueAfterBackoff,
},
"skip-unrelated-object": {
pod: podWithClaimTemplate,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
newObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := scheduling.DeepCopy()
scheduling.Name += "-foo"
return scheduling
}(),
expectedHint: framework.QueueSkip,
},
"backoff-wrong-new-object": {
pod: podWithClaimTemplate,
oldObj: scheduling,
newObj: "not-a-scheduling-context",
expectedHint: framework.QueueAfterBackoff,
},
"skip-missing-claim": {
pod: podWithClaimTemplate,
oldObj: scheduling,
newObj: schedulingInfo,
expectedHint: framework.QueueSkip,
},
"skip-missing-infos": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: scheduling,
newObj: scheduling,
expectedHint: framework.QueueSkip,
},
"queue-new-infos": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: scheduling,
newObj: schedulingInfo,
expectedHint: framework.QueueImmediately,
},
"queue-bad-selected-node": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := schedulingInfo.DeepCopy()
scheduling.Spec.SelectedNode = workerNode.Name
return scheduling
}(),
newObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := schedulingInfo.DeepCopy()
scheduling.Spec.SelectedNode = workerNode.Name
scheduling.Status.ResourceClaims[0].UnsuitableNodes = append(scheduling.Status.ResourceClaims[0].UnsuitableNodes, scheduling.Spec.SelectedNode)
return scheduling
}(),
expectedHint: framework.QueueImmediately,
},
"skip-spec-changes": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: schedulingInfo,
newObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := schedulingInfo.DeepCopy()
scheduling.Spec.SelectedNode = workerNode.Name
return scheduling
}(),
expectedHint: framework.QueueSkip,
},
"backoff-other-changes": {
pod: podWithClaimTemplateInStatus,
claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
oldObj: schedulingInfo,
newObj: func() *resourcev1alpha2.PodSchedulingContext {
scheduling := schedulingInfo.DeepCopy()
scheduling.Finalizers = append(scheduling.Finalizers, "foo")
return scheduling
}(),
expectedHint: framework.QueueAfterBackoff,
},
}
for name, tc := range testcases {
tc := tc
t.Run(name, func(t *testing.T) {
t.Parallel()
testCtx := setup(t, nil, tc.claims, nil, tc.schedulings)
actualHint := testCtx.p.isSchedulableAfterPodSchedulingContextChange(tc.pod, tc.oldObj, tc.newObj)
require.Equal(t, tc.expectedHint, actualHint)
})
}
}
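
The queueing decisions exercised by TestClaimChange can be summarized in a short, hedged sketch. This is not the implementation from this commit: podUsesClaim is an illustrative helper, and the "<pod name>-<pod claim name>" naming assumption again comes from the test fixtures; only the hint values (QueueSkip, QueueAfterBackoff, QueueImmediately) are taken directly from the tests above.

package dynamicresources

import (
	v1 "k8s.io/api/core/v1"
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// isSchedulableAfterClaimChange decides whether a ResourceClaim event can make
// the given pod schedulable: skip irrelevant events, back off on unexpected
// objects, and queue immediately when the claim the pod is waiting for gets
// created or its status changes.
func isSchedulableAfterClaimChange(pod *v1.Pod, oldObj, newObj interface{}) framework.QueueingHint {
	if newObj == nil {
		// A deleted claim never makes the pod schedulable.
		return framework.QueueSkip
	}
	claim, ok := newObj.(*resourcev1alpha2.ResourceClaim)
	if !ok {
		// Unexpected object type: fall back to the normal backoff path.
		return framework.QueueAfterBackoff
	}
	if !podUsesClaim(pod, claim) {
		// The claim is unrelated to this pod or belongs to a different pod
		// instance (wrong owner UID).
		return framework.QueueSkip
	}
	if oldObj == nil {
		// The claim the pod has been waiting for was created: retry right away.
		return framework.QueueImmediately
	}
	oldClaim, ok := oldObj.(*resourcev1alpha2.ResourceClaim)
	if !ok {
		return framework.QueueAfterBackoff
	}
	if apiequality.Semantic.DeepEqual(oldClaim.Status, claim.Status) {
		// Metadata-only changes (for example, a new finalizer) are not
		// relevant for scheduling.
		return framework.QueueSkip
	}
	// The status changed, e.g. an allocation was added: retry immediately.
	return framework.QueueImmediately
}

// podUsesClaim is an illustrative helper: the pod must either reference the
// claim by name or, for template-generated claims, the claim must have the
// expected name and be owned by this pod instance.
func podUsesClaim(pod *v1.Pod, claim *resourcev1alpha2.ResourceClaim) bool {
	for _, podClaim := range pod.Spec.ResourceClaims {
		if podClaim.Source.ResourceClaimName != nil {
			if *podClaim.Source.ResourceClaimName == claim.Name {
				return true
			}
			continue
		}
		if claim.Name != pod.Name+"-"+podClaim.Name {
			continue
		}
		for _, owner := range claim.OwnerReferences {
			if owner.UID == pod.UID {
				return true
			}
		}
	}
	return false
}

Returning QueueSkip for irrelevant events and QueueImmediately, rather than QueueAfterBackoff, for the events a pod is actually waiting for is what removes the re-scheduling delay described in the commit message.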