From 4a5a242a68d11abf16ca2fb6f91ed92e88e001e9 Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Wed, 28 Jun 2023 15:18:14 +0200
Subject: [PATCH 1/2] dra e2e: use logging for background activity

ginkgo.By should be used for steps in the test flow. Creating and
deleting CDI files happens in parallel to that. If reported via
ginkgo.By, progress reports look weird because they contain, e.g., the
step "waiting for ..." (from the main test, which is still ongoing) and
end with "creating CDI file" (which has already completed).
---
 test/e2e/dra/deploy.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/e2e/dra/deploy.go b/test/e2e/dra/deploy.go
index a7583462d67..6d762b13c28 100644
--- a/test/e2e/dra/deploy.go
+++ b/test/e2e/dra/deploy.go
@@ -214,11 +214,11 @@ func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) {
 	plugin, err := app.StartPlugin(logger, "/cdi", d.Name, nodename,
 		app.FileOperations{
 			Create: func(name string, content []byte) error {
-				ginkgo.By(fmt.Sprintf("creating CDI file %s on node %s:\n%s", name, nodename, string(content)))
+				klog.Background().Info("creating CDI file", "node", nodename, "filename", name, "content", string(content))
 				return d.createFile(&pod, name, content)
 			},
 			Remove: func(name string) error {
-				ginkgo.By(fmt.Sprintf("deleting CDI file %s on node %s", name, nodename))
+				klog.Background().Info("deleting CDI file", "node", nodename, "filename", name)
 				return d.removeFile(&pod, name)
 			},
 		},

From 1b47e6433b87923a709782fb547eded0d2e3991e Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Wed, 28 Jun 2023 15:33:07 +0200
Subject: [PATCH 2/2] dra delayed allocation: deallocate when a pod is done

This releases the underlying resource sooner and ensures that another
consumer can get scheduled without being influenced by a decision that
was made for the previous consumer.

An alternative would have been to have the apiserver trigger the
deallocation whenever it sees `status.reservedFor` getting reduced to
zero. But that would then also trigger deallocation when the
kube-scheduler removes the last reservation after a failed scheduling
cycle. In that case we want to keep the claim allocated and let the
kube-scheduler decide on a case-by-case basis which claim should get
deallocated.
---
 pkg/controller/resourceclaim/controller.go | 22 ++++++++++++++++++++++
 test/e2e/dra/dra.go                        | 21 +++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/pkg/controller/resourceclaim/controller.go b/pkg/controller/resourceclaim/controller.go
index 3e06237ce48..4dae8aef38f 100644
--- a/pkg/controller/resourceclaim/controller.go
+++ b/pkg/controller/resourceclaim/controller.go
@@ -456,6 +456,28 @@ func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) err
 		// TODO (#113700): patch
 		claim := claim.DeepCopy()
 		claim.Status.ReservedFor = valid
+
+		// When a ResourceClaim uses delayed allocation, then it makes sense to
+		// deallocate the claim as soon as the last consumer stops using
+		// it. This ensures that the claim can be allocated again as needed by
+		// some future consumer instead of trying to schedule that consumer
+		// onto the node that was chosen for the previous consumer. It also
+		// releases the underlying resources for use by other claims.
+		//
+		// This has to be triggered by the transition from "was being used" to
+		// "is not used anymore" because a DRA driver is not required to set
+		// `status.reservedFor` together with `status.allocation`, i.e. a claim
+		// that is "currently unused" should not get deallocated.
+		//
+		// This does not matter for claims that were created for a pod. For
+		// those, the resource claim controller will trigger deletion when the
+		// pod is done. However, it doesn't hurt to also trigger deallocation
+		// for such claims and not checking for them keeps this code simpler.
+		if len(valid) == 0 &&
+			claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer {
+			claim.Status.DeallocationRequested = true
+		}
+
 		_, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
 		if err != nil {
 			return err

diff --git a/test/e2e/dra/dra.go b/test/e2e/dra/dra.go
index 3d77fd93bd0..adcfd95bd11 100644
--- a/test/e2e/dra/dra.go
+++ b/test/e2e/dra/dra.go
@@ -254,6 +254,27 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu
 		ginkgo.Context("with immediate allocation", func() {
 			claimTests(resourcev1alpha2.AllocationModeImmediate)
 		})
+
+		ginkgo.It("must deallocate after use when using delayed allocation", func(ctx context.Context) {
+			parameters := b.parameters()
+			pod := b.podExternal()
+			claim := b.externalClaim(resourcev1alpha2.AllocationModeWaitForFirstConsumer)
+			b.create(ctx, parameters, claim, pod)
+
+			gomega.Eventually(ctx, func(ctx context.Context) (*resourcev1alpha2.ResourceClaim, error) {
+				return b.f.ClientSet.ResourceV1alpha2().ResourceClaims(b.f.Namespace.Name).Get(ctx, claim.Name, metav1.GetOptions{})
+			}).WithTimeout(f.Timeouts.PodDelete).ShouldNot(gomega.HaveField("Status.Allocation", (*resourcev1alpha2.AllocationResult)(nil)))
+
+			b.testPod(ctx, f.ClientSet, pod)
+
+			ginkgo.By(fmt.Sprintf("deleting pod %s", klog.KObj(pod)))
+			framework.ExpectNoError(b.f.ClientSet.CoreV1().Pods(b.f.Namespace.Name).Delete(ctx, pod.Name, metav1.DeleteOptions{}))
+
+			ginkgo.By("waiting for claim to get deallocated")
+			gomega.Eventually(ctx, func(ctx context.Context) (*resourcev1alpha2.ResourceClaim, error) {
+				return b.f.ClientSet.ResourceV1alpha2().ResourceClaims(b.f.Namespace.Name).Get(ctx, claim.Name, metav1.GetOptions{})
+			}).WithTimeout(f.Timeouts.PodDelete).Should(gomega.HaveField("Status.Allocation", (*resourcev1alpha2.AllocationResult)(nil)))
+		})
 	})
 
 	ginkgo.Context("multiple nodes", func() {