Merge pull request #125753 from SergeyKanzhelev/devicePluginFailuresTests
device plugin failure tests
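This change adds test/e2e_node/device_plugin_failures_test.go, a node e2e suite covering device plugin failure modes: a failing GetDevicePluginOptions call, a device flipping between healthy and unhealthy (with and without a pod holding it), ListAndWatch failing both immediately and after devices were provisioned, and the plugin being stopped. Each case asserts how the node's capacity and allocatable for the test resource react. The suite is driven by a new minimal plugin in test/e2e_node/testdeviceplugin/device-plugin.go, which serves the kubelet device plugin API over a unix socket, records which RPCs the kubelet called, and lets the test inject errors into individual RPCs and update device health at runtime.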
test/e2e_node/device_plugin_failures_test.go (new file, 356 lines)

@@ -0,0 +1,356 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"

	v1 "k8s.io/api/core/v1"
	kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	admissionapi "k8s.io/pod-security-admission/api"

	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/kubernetes/test/e2e/framework"
	"k8s.io/kubernetes/test/e2e_node/testdeviceplugin"
)

type ResourceValue struct {
	Allocatable int
	Capacity    int
}

// Serial because the test restarts Kubelet
var _ = SIGDescribe("Device Plugin Failures:", framework.WithNodeConformance(), func() {
	f := framework.NewDefaultFramework("device-plugin-failures")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	var getNodeResourceValues = func(ctx context.Context, resourceName string) ResourceValue {
		ginkgo.GinkgoHelper()
		node := getLocalNode(ctx, f)

		// -1 represents that the resource is not found
		result := ResourceValue{
			Allocatable: -1,
			Capacity:    -1,
		}

		for key, val := range node.Status.Capacity {
			resource := string(key)
			if resource == resourceName {
				result.Capacity = int(val.Value())
				break
			}
		}

		for key, val := range node.Status.Allocatable {
			resource := string(key)
			if resource == resourceName {
				result.Allocatable = int(val.Value())
				break
			}
		}

		return result
	}

	var createPod = func(resourceName string, quantity int) *v1.Pod {
		ginkgo.GinkgoHelper()
		rl := v1.ResourceList{v1.ResourceName(resourceName): *resource.NewQuantity(int64(quantity), resource.DecimalSI)}
		pod := &v1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: "device-plugin-failures-test-" + string(uuid.NewUUID())},
			Spec: v1.PodSpec{
				RestartPolicy: v1.RestartPolicyAlways,
				Containers: []v1.Container{{
					Image:   busyboxImage,
					Name:    "container-1",
					Command: []string{"sh", "-c", fmt.Sprintf("env && sleep %s", sleepIntervalForever)},
					Resources: v1.ResourceRequirements{
						Limits:   rl,
						Requests: rl,
					},
				}},
			},
		}
		return pod
	}

	nodeStatusUpdateTimeout := 1 * time.Minute
	devicePluginUpdateTimeout := 1 * time.Minute
	devicePluginGracefulTimeout := 5 * time.Minute // see endpointStopGracePeriod in pkg/kubelet/cm/devicemanager/types.go

	ginkgo.It("when GetDevicePluginOptions fails, device plugin will not be used", func(ctx context.Context) {
		// randomizing so tests can run in parallel
		resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)

		expectedErr := fmt.Errorf("GetDevicePluginOptions failed")

		plugin := testdeviceplugin.NewDevicePlugin(func(name string) error {
			if name == "GetDevicePluginOptions" {
				return expectedErr
			}
			return nil
		})

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, []kubeletdevicepluginv1beta1.Device{{ID: "testdevice", Health: kubeletdevicepluginv1beta1.Healthy}})
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.MatchError(gomega.ContainSubstring("failed to get device plugin options")))
		gomega.Expect(err).To(gomega.MatchError(gomega.ContainSubstring(expectedErr.Error())))

		gomega.Expect(plugin.WasCalled("ListAndWatch")).To(gomega.BeFalseBecause("plugin should not be used if GetDevicePluginOptions fails"))
		gomega.Expect(plugin.WasCalled("GetDevicePluginOptions")).To(gomega.BeTrueBecause("get device plugin options should be called exactly once"))
		gomega.Expect(plugin.Calls()).To(gomega.HaveLen(1))

		// kubelet will not even register the resource
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: -1, Capacity: -1}))
	})

	ginkgo.It("will set allocatable to zero when a single device became unhealthy and then back to 1 if it got healthy again", func(ctx context.Context) {
		// randomizing so tests can run in parallel
		resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
		devices := []kubeletdevicepluginv1beta1.Device{{ID: "testdevice", Health: kubeletdevicepluginv1beta1.Healthy}}
		plugin := testdeviceplugin.NewDevicePlugin(nil)

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.Succeed())

		// at first the device is healthy
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 1, Capacity: 1}))

		// now make the device unhealthy
		devices[0].Health = kubeletdevicepluginv1beta1.Unhealthy
		plugin.UpdateDevices(devices)

		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 1}))

		// now make the device healthy again
		devices[0].Health = kubeletdevicepluginv1beta1.Healthy
		plugin.UpdateDevices(devices)

		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 1, Capacity: 1}))
	})

	ginkgo.It("will set allocatable to zero when a single device became unhealthy, but capacity will stay at 1", func(ctx context.Context) {
		// randomizing so tests can run in parallel
		resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
		devices := []kubeletdevicepluginv1beta1.Device{{ID: "testdevice", Health: kubeletdevicepluginv1beta1.Healthy}}
		plugin := testdeviceplugin.NewDevicePlugin(nil)

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.Succeed())

		ginkgo.By("initial state: capacity and allocatable are set")
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 1, Capacity: 1}))

		// schedule a pod that requests the device
		client := e2epod.NewPodClient(f)
		pod := client.Create(ctx, createPod(resourceName, 1))

		// wait for the pod to be running
		gomega.Expect(e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod)).To(gomega.Succeed())

		ginkgo.By("once pod is running, it does not affect allocatable value")
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 1, Capacity: 1}))

		// now make the device unhealthy
		devices[0].Health = kubeletdevicepluginv1beta1.Unhealthy
		plugin.UpdateDevices(devices)

		ginkgo.By("even when the device became unhealthy, the pod is still running and the capacity is kept")
		// allocatable drops to zero, but capacity stays at 1 while the device is still registered
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 1}))

		// pod is not affected by the device becoming unhealthy
		gomega.Consistently(func() v1.PodPhase {
			pod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{})
			return pod.Status.Phase
		}, devicePluginUpdateTimeout, f.Timeouts.Poll).Should(gomega.Equal(v1.PodRunning))

		// deleting the pod
		err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Delete(ctx, pod.Name, metav1.DeleteOptions{})
		gomega.Expect(err).To(gomega.Succeed())

		// wait for the pod to be deleted
		gomega.Eventually(func() error {
			_, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{})
			return err
		}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.MatchError(gomega.ContainSubstring("not found")))

		ginkgo.By("when pod is deleted, nothing changes")
		gomega.Eventually(getNodeResourceValues, devicePluginGracefulTimeout+1*time.Minute, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 1}))
	})

	ginkgo.It("will lower allocatable to a number of unhealthy devices and then back if they became healthy again", func(ctx context.Context) {
		// randomizing so tests can run in parallel
		resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)

		devices := []kubeletdevicepluginv1beta1.Device{
			{ID: "0", Health: kubeletdevicepluginv1beta1.Healthy},
			{ID: "1", Health: kubeletdevicepluginv1beta1.Healthy},
			{ID: "2", Health: kubeletdevicepluginv1beta1.Healthy},
			{ID: "3", Health: kubeletdevicepluginv1beta1.Healthy},
		}
		plugin := testdeviceplugin.NewDevicePlugin(nil)

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.Succeed())

		// at first all the devices are healthy
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 4, Capacity: 4}))

		// now make one device unhealthy
		devices[3].Health = kubeletdevicepluginv1beta1.Unhealthy
		plugin.UpdateDevices(devices)

		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 3, Capacity: 4}))

		// now make that device healthy again
		devices[3].Health = kubeletdevicepluginv1beta1.Healthy
		plugin.UpdateDevices(devices)

		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 4, Capacity: 4}))

		// now make two devices unhealthy
		devices[1].Health = kubeletdevicepluginv1beta1.Unhealthy
		devices[3].Health = kubeletdevicepluginv1beta1.Unhealthy
		plugin.UpdateDevices(devices)

		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 2, Capacity: 4}))

		// now make one of the two devices healthy again
		devices[3].Health = kubeletdevicepluginv1beta1.Healthy
		plugin.UpdateDevices(devices)

		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 3, Capacity: 4}))

		// now make the remaining device healthy again
		devices[1].Health = kubeletdevicepluginv1beta1.Healthy
		plugin.UpdateDevices(devices)

		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 4, Capacity: 4}))
	})

	ginkgo.It("when ListAndWatch fails immediately, node allocatable will be set to zero and kubelet will not retry to list resources", func(ctx context.Context) {
		// randomizing so tests can run in parallel
		resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
		devices := []kubeletdevicepluginv1beta1.Device{{ID: "testdevice", Health: kubeletdevicepluginv1beta1.Healthy}}

		// Initially, the node does not report this resource at all
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: -1, Capacity: -1}))

		plugin := testdeviceplugin.NewDevicePlugin(func(name string) error {
			if name == "ListAndWatch" {
				return fmt.Errorf("ListAndWatch failed")
			}
			return nil
		})

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.Succeed())

		// kubelet registers the resource, but will not have any allocatable
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 0}))

		// kubelet will never retry ListAndWatch (this will sleep for a long time)
		gomega.Consistently(plugin.Calls, devicePluginUpdateTimeout, f.Timeouts.Poll).Should(gomega.HaveLen(2))

		// however kubelet will not delete the resource
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 0}))
	})

	ginkgo.It("when ListAndWatch fails after provisioning devices, node allocatable will be set to zero and kubelet will not retry to list resources", func(ctx context.Context) {
		// randomizing so tests can run in parallel
		resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
		devices := []kubeletdevicepluginv1beta1.Device{
			{ID: "0", Health: kubeletdevicepluginv1beta1.Healthy},
			{ID: "1", Health: kubeletdevicepluginv1beta1.Healthy},
		}

		failing := false
		plugin := testdeviceplugin.NewDevicePlugin(func(name string) error {
			if name == "ListAndWatch" {
				if failing {
					return fmt.Errorf("ListAndWatch failed")
				}
			}
			return nil
		})

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.Succeed())

		// at first the devices are healthy
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 2, Capacity: 2}))

		// let's make ListAndWatch fail
		failing = true

		// kubelet will mark all devices as unhealthy
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 2}))

		// kubelet will never retry ListAndWatch (this will sleep for a long time)
		gomega.Consistently(plugin.Calls, devicePluginUpdateTimeout, f.Timeouts.Poll).Should(gomega.HaveLen(2))

		// however kubelet will not delete the resource and will keep the capacity
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 2}))

		// after the graceful period the devices' capacity will reset to zero
		gomega.Eventually(getNodeResourceValues, devicePluginGracefulTimeout+1*time.Minute, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 0}))
	})

	ginkgo.It("when device plugin is stopped after provisioning devices, node allocatable will be set to zero", func(ctx context.Context) {
		// randomizing so tests can run in parallel
		resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
		devices := []kubeletdevicepluginv1beta1.Device{
			{ID: "0", Health: kubeletdevicepluginv1beta1.Healthy},
			{ID: "1", Health: kubeletdevicepluginv1beta1.Healthy},
		}

		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: -1, Capacity: -1}))

		plugin := testdeviceplugin.NewDevicePlugin(nil)

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.Succeed())

		// at first the devices are healthy
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 2, Capacity: 2}))

		// let's unload the plugin
		plugin.Stop()

		// kubelet will mark all devices as unhealthy
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 2}))

		// after the graceful period the devices' capacity will reset to zero
		gomega.Eventually(getNodeResourceValues, devicePluginGracefulTimeout+1*time.Minute, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 0}))
	})
})

test/e2e_node/testdeviceplugin/device-plugin.go (new file, 237 lines)

@@ -0,0 +1,237 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package testdeviceplugin

import (
	"context"
	"fmt"
	"net"
	"os"
	"sync"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// DevicePlugin is a minimal device plugin implementation used by node e2e tests.
// It serves the kubelet device plugin API over a unix socket, records which RPCs
// the kubelet called, and lets tests inject errors into individual RPCs and
// update the advertised devices at runtime.
type DevicePlugin struct {
	server     *grpc.Server
	uniqueName string

	devices     []kubeletdevicepluginv1beta1.Device
	devicesSync sync.Mutex

	devicesUpdateCh chan struct{}

	calls     []string
	callsSync sync.Mutex

	errorInjector func(string) error
}

// NewDevicePlugin creates a test device plugin. errorInjector, if non-nil, is
// called with the RPC name and may return an error to make that RPC fail.
func NewDevicePlugin(errorInjector func(string) error) *DevicePlugin {
	return &DevicePlugin{
		calls:           []string{},
		devicesUpdateCh: make(chan struct{}),
		errorInjector:   errorInjector,
	}
}

func (dp *DevicePlugin) GetDevicePluginOptions(context.Context, *kubeletdevicepluginv1beta1.Empty) (*kubeletdevicepluginv1beta1.DevicePluginOptions, error) {
	// lock the mutex and add to a list of calls
	dp.callsSync.Lock()
	dp.calls = append(dp.calls, "GetDevicePluginOptions")
	dp.callsSync.Unlock()

	if dp.errorInjector != nil {
		return &kubeletdevicepluginv1beta1.DevicePluginOptions{}, dp.errorInjector("GetDevicePluginOptions")
	}
	return &kubeletdevicepluginv1beta1.DevicePluginOptions{}, nil
}

func (dp *DevicePlugin) sendDevices(stream kubeletdevicepluginv1beta1.DevicePlugin_ListAndWatchServer) error {
	resp := new(kubeletdevicepluginv1beta1.ListAndWatchResponse)

	dp.devicesSync.Lock()
	for _, d := range dp.devices {
		resp.Devices = append(resp.Devices, &d)
	}
	dp.devicesSync.Unlock()

	return stream.Send(resp)
}

func (dp *DevicePlugin) ListAndWatch(empty *kubeletdevicepluginv1beta1.Empty, stream kubeletdevicepluginv1beta1.DevicePlugin_ListAndWatchServer) error {
	dp.callsSync.Lock()
	dp.calls = append(dp.calls, "ListAndWatch")
	dp.callsSync.Unlock()

	if dp.errorInjector != nil {
		if err := dp.errorInjector("ListAndWatch"); err != nil {
			return err
		}
	}

	if err := dp.sendDevices(stream); err != nil {
		return err
	}

	// when the devices are updated, send the new devices to the kubelet
	// also start a timer and every second call into the dp.errorInjector
	// to simulate a device plugin failure
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-dp.devicesUpdateCh:
			if err := dp.sendDevices(stream); err != nil {
				return err
			}
		case <-ticker.C:
			if dp.errorInjector != nil {
				if err := dp.errorInjector("ListAndWatch"); err != nil {
					return err
				}
			}
		}
	}
}

func (dp *DevicePlugin) Allocate(ctx context.Context, request *kubeletdevicepluginv1beta1.AllocateRequest) (*kubeletdevicepluginv1beta1.AllocateResponse, error) {
	result := new(kubeletdevicepluginv1beta1.AllocateResponse)

	dp.callsSync.Lock()
	dp.calls = append(dp.calls, "Allocate")
	dp.callsSync.Unlock()

	for _, r := range request.ContainerRequests {
		response := &kubeletdevicepluginv1beta1.ContainerAllocateResponse{}
		for _, id := range r.DevicesIDs {
			// create a dummy file per allocated device and expose it to the container as a mount
			fpath, err := os.CreateTemp("/tmp", fmt.Sprintf("%s-%s", dp.uniqueName, id))
			gomega.Expect(err).To(gomega.Succeed())

			response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{
				ContainerPath: fpath.Name(),
				HostPath:      fpath.Name(),
			})
		}
		result.ContainerResponses = append(result.ContainerResponses, response)
	}

	return result, nil
}

func (dp *DevicePlugin) PreStartContainer(ctx context.Context, request *kubeletdevicepluginv1beta1.PreStartContainerRequest) (*kubeletdevicepluginv1beta1.PreStartContainerResponse, error) {
	return nil, nil
}

func (dp *DevicePlugin) GetPreferredAllocation(ctx context.Context, request *kubeletdevicepluginv1beta1.PreferredAllocationRequest) (*kubeletdevicepluginv1beta1.PreferredAllocationResponse, error) {
	return nil, nil
}

// RegisterDevicePlugin starts the plugin's gRPC server on a per-test unix socket
// and registers the given resource with the kubelet.
func (dp *DevicePlugin) RegisterDevicePlugin(ctx context.Context, uniqueName, resourceName string, devices []kubeletdevicepluginv1beta1.Device) error {
	ginkgo.GinkgoHelper()

	dp.devicesSync.Lock()
	dp.devices = devices
	dp.devicesSync.Unlock()

	devicePluginEndpoint := fmt.Sprintf("%s-%s.sock", "test-device-plugin", uniqueName)
	dp.uniqueName = uniqueName

	// Create a new gRPC server and register the device plugin service on it
	dp.server = grpc.NewServer()
	kubeletdevicepluginv1beta1.RegisterDevicePluginServer(dp.server, dp)
	// Listen on the device plugin's unix socket
	lis, err := net.Listen("unix", kubeletdevicepluginv1beta1.DevicePluginPath+devicePluginEndpoint)
	if err != nil {
		return err
	}
	// Start the gRPC server
	go func() {
		err := dp.server.Serve(lis)
		gomega.Expect(err).To(gomega.Succeed())
	}()

	// Create a connection to the kubelet
	conn, err := grpc.NewClient("unix://"+kubeletdevicepluginv1beta1.KubeletSocket,
		grpc.WithTransportCredentials(insecure.NewCredentials()),
	)
	if err != nil {
		return err
	}
	defer func() {
		err := conn.Close()
		gomega.Expect(err).To(gomega.Succeed())
	}()

	// Create a client for the kubelet registration API
	client := kubeletdevicepluginv1beta1.NewRegistrationClient(conn)

	// Register the device plugin with the kubelet
	_, err = client.Register(ctx, &kubeletdevicepluginv1beta1.RegisterRequest{
		Version:      kubeletdevicepluginv1beta1.Version,
		Endpoint:     devicePluginEndpoint,
		ResourceName: resourceName,
	})
	if err != nil {
		return err
	}
	return nil
}

func (dp *DevicePlugin) Stop() {
	if dp.server != nil {
		dp.server.Stop()
		dp.server = nil
	}
}

func (dp *DevicePlugin) WasCalled(method string) bool {
	// lock mutex and then search if the method was called
	dp.callsSync.Lock()
	defer dp.callsSync.Unlock()
	for _, call := range dp.calls {
		if call == method {
			return true
		}
	}
	return false
}

func (dp *DevicePlugin) Calls() []string {
	// lock the mutex and return the calls
	dp.callsSync.Lock()
	defer dp.callsSync.Unlock()
	// return a copy of the calls
	calls := make([]string, len(dp.calls))
	copy(calls, dp.calls)
	return calls
}

func (dp *DevicePlugin) UpdateDevices(devices []kubeletdevicepluginv1beta1.Device) {
	// lock the mutex and update the devices
	dp.devicesSync.Lock()
	defer dp.devicesSync.Unlock()
	dp.devices = devices
	dp.devicesUpdateCh <- struct{}{}
}