Merge pull request #124595 from pohly/dra-scheduler-assume-cache-eventhandlers

DRA: scheduler event handlers via assume cache
2024-06-25 11:56:28 -07:00
parent 8b72795621 1b63639d31
commit 8c478a06d8
12 changed files with 790 additions and 87 deletions
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
@@ -276,7 +276,6 @@ type dynamicResources struct {
 	enabled                    bool
 	fh                         framework.Handle
 	clientset                  kubernetes.Interface
-	claimLister                resourcev1alpha2listers.ResourceClaimLister
 	classLister                resourcev1alpha2listers.ResourceClassLister
 	podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister
 	claimParametersLister      resourcev1alpha2listers.ResourceClaimParametersLister
@@ -354,12 +353,10 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
 		return &dynamicResources{}, nil
 	}

-	logger := klog.FromContext(ctx)
 	pl := &dynamicResources{
 		enabled:                    true,
 		fh:                         fh,
 		clientset:                  fh.ClientSet(),
-		claimLister:                fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaims().Lister(),
 		classLister:                fh.SharedInformerFactory().Resource().V1alpha2().ResourceClasses().Lister(),
 		podSchedulingContextLister: fh.SharedInformerFactory().Resource().V1alpha2().PodSchedulingContexts().Lister(),
 		claimParametersLister:      fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaimParameters().Lister(),
@@ -368,7 +365,7 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
 		classParametersIndexer:     fh.SharedInformerFactory().Resource().V1alpha2().ResourceClassParameters().Informer().GetIndexer(),
 		resourceSliceLister:        fh.SharedInformerFactory().Resource().V1alpha2().ResourceSlices().Lister(),
 		claimNameLookup:            resourceclaim.NewNameLookup(fh.ClientSet()),
-		claimAssumeCache:           assumecache.NewAssumeCache(logger, fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaims().Informer(), "claim", "", nil),
+		claimAssumeCache:           fh.ResourceClaimCache(),
 	}

 	if err := pl.claimParametersIndexer.AddIndexers(cache.Indexers{generatedFromIndex: claimParametersGeneratedFromIndexFunc}); err != nil {
@@ -651,21 +648,6 @@ func (pl *dynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, po
 		//
 		// TODO (https://github.com/kubernetes/kubernetes/issues/123697):
 		// check that the pending claims depend on structured parameters (depends on refactoring foreachPodResourceClaim, see other TODO).
-		//
-		// There is a small race here:
-		// - The dynamicresources plugin allocates claim A and updates the assume cache.
-		// - A second pod gets marked as unschedulable based on that assume cache.
-		// - Before the informer cache here catches up, the pod runs, terminates and
-		//   the claim gets deallocated without ever sending the claim status with
-		//   allocation to the scheduler.
-		// - The comparison below is for a *very* old claim with no allocation and the
-		//   new claim where the allocation is already removed again, so no
-		//   RemovedClaimAllocation event gets emitted.
-		//
-		// This is extremely unlikely and thus a fix is not needed for alpha in Kubernetes 1.30.
-		// TODO (https://github.com/kubernetes/kubernetes/issues/123698): The solution is to somehow integrate the assume cache
-		// into the event mechanism. This can be tackled together with adding autoscaler
-		// support, which also needs to do something with the assume cache.
 		logger.V(6).Info("claim with structured parameters got deallocated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
 		return framework.Queue, nil
 	}
@@ -852,11 +834,16 @@ func (pl *dynamicResources) foreachPodResourceClaim(pod *v1.Pod, cb func(podReso
 		if claimName == nil {
 			continue
 		}
-		claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
+		obj, err := pl.claimAssumeCache.Get(pod.Namespace + "/" + *claimName)
 		if err != nil {
 			return err
 		}

+		claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
+		if !ok {
+			return fmt.Errorf("unexpected object type %T for assumed object %s/%s", obj, pod.Namespace, *claimName)
+		}
+
 		if claim.DeletionTimestamp != nil {
 			return fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
 		}
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go
@@ -44,6 +44,7 @@ import (
 	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
 	"k8s.io/kubernetes/pkg/scheduler/framework/runtime"
 	st "k8s.io/kubernetes/pkg/scheduler/testing"
+	"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
 	"k8s.io/kubernetes/test/utils/ktesting"
 	"k8s.io/utils/ptr"
 )
@@ -1217,12 +1218,13 @@ func TestPlugin(t *testing.T) {
 }

 type testContext struct {
-	ctx             context.Context
-	client          *fake.Clientset
-	informerFactory informers.SharedInformerFactory
-	p               *dynamicResources
-	nodeInfos       []*framework.NodeInfo
-	state           *framework.CycleState
+	ctx              context.Context
+	client           *fake.Clientset
+	informerFactory  informers.SharedInformerFactory
+	claimAssumeCache *assumecache.AssumeCache
+	p                *dynamicResources
+	nodeInfos        []*framework.NodeInfo
+	state            *framework.CycleState
 }

 func (tc *testContext) verify(t *testing.T, expected result, initialObjects []metav1.Object, result interface{}, status *framework.Status) {
@@ -1388,10 +1390,11 @@ func setup(t *testing.T, nodes []*v1.Node, claims []*resourcev1alpha2.ResourceCl
 	tc.client.PrependReactor("list", "resourceclassparameters", createListReactor(tc.client.Tracker(), "ResourceClassParameters"))

 	tc.informerFactory = informers.NewSharedInformerFactory(tc.client, 0)
-
+	tc.claimAssumeCache = assumecache.NewAssumeCache(tCtx.Logger(), tc.informerFactory.Resource().V1alpha2().ResourceClaims().Informer(), "resource claim", "", nil)
 	opts := []runtime.Option{
 		runtime.WithClientSet(tc.client),
 		runtime.WithInformerFactory(tc.informerFactory),
+		runtime.WithResourceClaimCache(tc.claimAssumeCache),
 	}
 	fh, err := runtime.NewFramework(tCtx, nil, nil, opts...)
 	if err != nil {
@@ -1558,6 +1561,7 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
 		},
 		"backoff-wrong-old-object": {
 			pod:         podWithClaimName,
+			claims:      []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
 			oldObj:      "not-a-claim",
 			newObj:      pendingImmediateClaim,
 			expectedErr: true,
@@ -1586,15 +1590,10 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {
 		},
 		"structured-claim-deallocate": {
 			pod:    podWithClaimName,
-			claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim},
-			oldObj: func() *resourcev1alpha2.ResourceClaim {
-				claim := structuredAllocatedClaim.DeepCopy()
-				claim.Name += "-other"
-				return claim
-			}(),
+			claims: []*resourcev1alpha2.ResourceClaim{pendingDelayedClaim, otherStructuredAllocatedClaim},
+			oldObj: otherStructuredAllocatedClaim,
 			newObj: func() *resourcev1alpha2.ResourceClaim {
-				claim := structuredAllocatedClaim.DeepCopy()
-				claim.Name += "-other"
+				claim := otherStructuredAllocatedClaim.DeepCopy()
 				claim.Status.Allocation = nil
 				return claim
 			}(),
@@ -1606,18 +1605,48 @@ func Test_isSchedulableAfterClaimChange(t *testing.T) {

 	for name, tc := range testcases {
 		t.Run(name, func(t *testing.T) {
-			logger, _ := ktesting.NewTestContext(t)
+			logger, tCtx := ktesting.NewTestContext(t)
 			testCtx := setup(t, nil, tc.claims, nil, nil, nil)
+			oldObj := tc.oldObj
+			newObj := tc.newObj
 			if claim, ok := tc.newObj.(*resourcev1alpha2.ResourceClaim); ok {
-				// Update the informer because the lister gets called and must have the claim.
-				store := testCtx.informerFactory.Resource().V1alpha2().ResourceClaims().Informer().GetStore()
+				// Add or update through the client and wait until the event is processed.
+				claimKey := claim.Namespace + "/" + claim.Name
 				if tc.oldObj == nil {
-					require.NoError(t, store.Add(claim))
+					// Some test claims already have it. Clear for create.
+					createClaim := claim.DeepCopy()
+					createClaim.UID = ""
+					storedClaim, err := testCtx.client.ResourceV1alpha2().ResourceClaims(createClaim.Namespace).Create(tCtx, createClaim, metav1.CreateOptions{})
+					require.NoError(t, err, "create claim")
+					claim = storedClaim
 				} else {
-					require.NoError(t, store.Update(claim))
+					cachedClaim, err := testCtx.claimAssumeCache.Get(claimKey)
+					require.NoError(t, err, "retrieve old claim")
+					updateClaim := claim.DeepCopy()
+					// The test claim doesn't have those (generated dynamically), so copy them.
+					updateClaim.UID = cachedClaim.(*resourcev1alpha2.ResourceClaim).UID
+					updateClaim.ResourceVersion = cachedClaim.(*resourcev1alpha2.ResourceClaim).ResourceVersion
+
+					storedClaim, err := testCtx.client.ResourceV1alpha2().ResourceClaims(updateClaim.Namespace).Update(tCtx, updateClaim, metav1.UpdateOptions{})
+					require.NoError(t, err, "update claim")
+					claim = storedClaim
 				}
+
+				// Eventually the assume cache will have it, too.
+				require.EventuallyWithT(t, func(t *assert.CollectT) {
+					cachedClaim, err := testCtx.claimAssumeCache.Get(claimKey)
+					require.NoError(t, err, "retrieve claim")
+					if cachedClaim.(*resourcev1alpha2.ResourceClaim).ResourceVersion != claim.ResourceVersion {
+						t.Errorf("cached claim not updated yet")
+					}
+				}, time.Minute, time.Second, "claim assume cache must have new or updated claim")
+
+				// This has the actual UID and ResourceVersion,
+				// which is relevant for
+				// isSchedulableAfterClaimChange.
+				newObj = claim
 			}
-			actualHint, err := testCtx.p.isSchedulableAfterClaimChange(logger, tc.pod, tc.oldObj, tc.newObj)
+			actualHint, err := testCtx.p.isSchedulableAfterClaimChange(logger, tc.pod, oldObj, newObj)
 			if tc.expectedErr {
 				require.Error(t, err)
 				return