/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"

	v1 "k8s.io/api/core/v1"
	kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	admissionapi "k8s.io/pod-security-admission/api"

	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/kubernetes/test/e2e/framework"
	"k8s.io/kubernetes/test/e2e_node/testdeviceplugin"
)

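// ResourceValue holds the allocatable and capacity counts that the node
// reports for a single extended resource.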
type ResourceValue struct {
	Allocatable int
	Capacity    int
}

// Serial because the test restarts Kubelet
var _ = SIGDescribe("Device Plugin Failures:", framework.WithNodeConformance(), func() {
	f := framework.NewDefaultFramework("device-plugin-failures")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

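	// getNodeResourceValues reads the local node status and returns the
	// allocatable and capacity counts reported for resourceName.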
	var getNodeResourceValues = func(ctx context.Context, resourceName string) ResourceValue {
		ginkgo.GinkgoHelper()
		node := getLocalNode(ctx, f)

		// -1 means the resource was not found in the node status
		result := ResourceValue{
			Allocatable: -1,
			Capacity:    -1,
		}

		for key, val := range node.Status.Capacity {
			if string(key) == resourceName {
				result.Capacity = int(val.Value())
				break
			}
		}

		for key, val := range node.Status.Allocatable {
			if string(key) == resourceName {
				result.Allocatable = int(val.Value())
				break
			}
		}

		return result
	}

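	// createPod builds a pod with a single busybox container that requests
	// `quantity` units of the given extended resource.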
	var createPod = func(resourceName string, quantity int) *v1.Pod {
		ginkgo.GinkgoHelper()
		rl := v1.ResourceList{v1.ResourceName(resourceName): *resource.NewQuantity(int64(quantity), resource.DecimalSI)}
		pod := &v1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: "device-plugin-failures-test-" + string(uuid.NewUUID())},
			Spec: v1.PodSpec{
				RestartPolicy: v1.RestartPolicyAlways,
				Containers: []v1.Container{{
					Image:   busyboxImage,
					Name:    "container-1",
					Command: []string{"sh", "-c", fmt.Sprintf("env && sleep %s", sleepIntervalForever)},
					Resources: v1.ResourceRequirements{
						Limits:   rl,
						Requests: rl,
					},
				}},
			},
		}
		return pod
	}

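	// How long to wait for the node status and the device manager state to converge.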
	nodeStatusUpdateTimeout := 1 * time.Minute
	devicePluginUpdateTimeout := 1 * time.Minute
	devicePluginGracefulTimeout := 5 * time.Minute // see endpointStopGracePeriod in pkg/kubelet/cm/devicemanager/types.go

ginkgo.It("when GetDevicePluginOptions fails, device plugin will not be used", func(ctx context.Context) {
|
|
// randomizing so tests can run in parallel
|
|
resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
|
|
|
|
expectedErr := fmt.Errorf("GetDevicePluginOptions failed")
|
|
|
|
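		// The callback is invoked with the name of each device plugin API
		// method; returning an error makes that method fail.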
		plugin := testdeviceplugin.NewDevicePlugin(func(name string) error {
			if name == "GetDevicePluginOptions" {
				return expectedErr
			}
			return nil
		})

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, []kubeletdevicepluginv1beta1.Device{{ID: "testdevice", Health: kubeletdevicepluginv1beta1.Healthy}})
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.MatchError(gomega.ContainSubstring("failed to get device plugin options")))
		gomega.Expect(err).To(gomega.MatchError(gomega.ContainSubstring(expectedErr.Error())))

		gomega.Expect(plugin.WasCalled("ListAndWatch")).To(gomega.BeFalseBecause("plugin should not be used if GetDevicePluginOptions fails"))
		gomega.Expect(plugin.WasCalled("GetDevicePluginOptions")).To(gomega.BeTrueBecause("get device plugin options should be called exactly once"))
		gomega.Expect(plugin.Calls()).To(gomega.HaveLen(1))

		// kubelet will not even register the resource
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: -1, Capacity: -1}))
	})

ginkgo.It("will set allocatable to zero when a single device became unhealthy and then back to 1 if it got healthy again", func(ctx context.Context) {
|
|
// randomizing so tests can run in parallel
|
|
resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
|
|
devices := []kubeletdevicepluginv1beta1.Device{{ID: "testdevice", Health: kubeletdevicepluginv1beta1.Healthy}}
|
|
plugin := testdeviceplugin.NewDevicePlugin(nil)
|
|
|
|
err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
|
|
defer plugin.Stop() // should stop even if registration failed
|
|
gomega.Expect(err).To(gomega.Succeed())
|
|
|
|
// at first the device is healthy
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 1, Capacity: 1}))
|
|
|
|
// now make the device unhealthy
|
|
devices[0].Health = kubeletdevicepluginv1beta1.Unhealthy
|
|
plugin.UpdateDevices(devices)
|
|
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 1}))
|
|
|
|
// now make the device healthy again
|
|
devices[0].Health = kubeletdevicepluginv1beta1.Healthy
|
|
plugin.UpdateDevices(devices)
|
|
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 1, Capacity: 1}))
|
|
})
|
|
|
|
ginkgo.It("will set allocatable to zero when a single device became unhealthy, but capacity will stay at 1", func(ctx context.Context) {
|
|
// randomizing so tests can run in parallel
|
|
resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
|
|
devices := []kubeletdevicepluginv1beta1.Device{{ID: "testdevice", Health: kubeletdevicepluginv1beta1.Healthy}}
|
|
plugin := testdeviceplugin.NewDevicePlugin(nil)
|
|
|
|
err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
|
|
defer plugin.Stop() // should stop even if registration failed
|
|
gomega.Expect(err).To(gomega.Succeed())
|
|
|
|
ginkgo.By("initial state: capacity and allocatable are set")
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 1, Capacity: 1}))
|
|
|
|
// schedule a pod that requests the device
|
|
client := e2epod.NewPodClient(f)
|
|
pod := client.Create(ctx, createPod(resourceName, 1))
|
|
|
|
// wait for the pod to be running
|
|
gomega.Expect(e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod)).To(gomega.Succeed())
|
|
|
|
ginkgo.By("once pod is running, it does not affect allocatable value")
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 1, Capacity: 1}))
|
|
|
|
// now make the device unhealthy
|
|
devices[0].Health = kubeletdevicepluginv1beta1.Unhealthy
|
|
plugin.UpdateDevices(devices)
|
|
|
|
ginkgo.By("even when device became unhealthy. pod is still running and keeping the capacity")
|
|
// we keep the allocatable at the same value even though device is not healthy any longer
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 1}))
|
|
|
|
// pod is not affected by the device becoming unhealthy
|
|
|
|
gomega.Consistently(func() v1.PodPhase {
|
|
pod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{})
|
|
return pod.Status.Phase
|
|
}, devicePluginUpdateTimeout, f.Timeouts.Poll).Should(gomega.Equal(v1.PodRunning))
|
|
|
|
// deleting the pod
|
|
err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Delete(ctx, pod.Name, metav1.DeleteOptions{})
|
|
gomega.Expect(err).To(gomega.Succeed())
|
|
|
|
// wait for the pod to be deleted
|
|
gomega.Eventually(func() error {
|
|
_, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{})
|
|
return err
|
|
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.MatchError((gomega.ContainSubstring("not found"))))
|
|
|
|
ginkgo.By("when pod is deleted, nothing changes")
|
|
gomega.Eventually(getNodeResourceValues, devicePluginGracefulTimeout+1*time.Minute, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 1}))
|
|
})
|
|
|
|
ginkgo.It("will lower allocatable to a number of unhealthy devices and then back if they became healthy again", func(ctx context.Context) {
|
|
// randomizing so tests can run in parallel
|
|
resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
|
|
|
|
devices := []kubeletdevicepluginv1beta1.Device{
|
|
{ID: "0", Health: kubeletdevicepluginv1beta1.Healthy},
|
|
{ID: "1", Health: kubeletdevicepluginv1beta1.Healthy},
|
|
{ID: "2", Health: kubeletdevicepluginv1beta1.Healthy},
|
|
{ID: "3", Health: kubeletdevicepluginv1beta1.Healthy},
|
|
}
|
|
plugin := testdeviceplugin.NewDevicePlugin(nil)
|
|
|
|
err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
|
|
defer plugin.Stop() // should stop even if registration failed
|
|
gomega.Expect(err).To(gomega.Succeed())
|
|
|
|
// at first all the devices are healthy
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 4, Capacity: 4}))
|
|
|
|
// now make one device unhealthy
|
|
devices[3].Health = kubeletdevicepluginv1beta1.Unhealthy
|
|
plugin.UpdateDevices(devices)
|
|
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 3, Capacity: 4}))
|
|
|
|
// now make the device healthy again
|
|
devices[3].Health = kubeletdevicepluginv1beta1.Healthy
|
|
plugin.UpdateDevices(devices)
|
|
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 4, Capacity: 4}))
|
|
|
|
// now make two devices unhealthy
|
|
devices[1].Health = kubeletdevicepluginv1beta1.Unhealthy
|
|
devices[3].Health = kubeletdevicepluginv1beta1.Unhealthy
|
|
plugin.UpdateDevices(devices)
|
|
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 2, Capacity: 4}))
|
|
|
|
// now make the device healthy again
|
|
devices[3].Health = kubeletdevicepluginv1beta1.Healthy
|
|
plugin.UpdateDevices(devices)
|
|
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 3, Capacity: 4}))
|
|
|
|
// now make the device healthy again
|
|
devices[1].Health = kubeletdevicepluginv1beta1.Healthy
|
|
plugin.UpdateDevices(devices)
|
|
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 4, Capacity: 4}))
|
|
})
|
|
|
|
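	// The next two cases exercise ListAndWatch failures: one where the very
	// first call fails, and one where the call starts failing only after
	// devices were provisioned.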
ginkgo.It("when ListAndWatch fails immediately, node allocatable will be set to zero and kubelet will not retry to list resources", func(ctx context.Context) {
|
|
// randomizing so tests can run in parallel
|
|
resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
|
|
devices := []kubeletdevicepluginv1beta1.Device{{ID: "testdevice", Health: kubeletdevicepluginv1beta1.Healthy}}
|
|
|
|
// Initially, there are no allocatable of this resource
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: -1, Capacity: -1}))
|
|
|
|
plugin := testdeviceplugin.NewDevicePlugin(func(name string) error {
|
|
if name == "ListAndWatch" {
|
|
return fmt.Errorf("ListAndWatch failed")
|
|
}
|
|
return nil
|
|
})
|
|
|
|
err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
|
|
defer plugin.Stop() // should stop even if registration failed
|
|
gomega.Expect(err).To(gomega.Succeed())
|
|
|
|
// kubelet registers the resource, but will not have any allocatable
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 0}))
|
|
|
|
// kubelet will never retry ListAndWatch (this will sleep for a long time)
|
|
gomega.Consistently(plugin.Calls, devicePluginUpdateTimeout, f.Timeouts.Poll).Should(gomega.HaveLen(2))
|
|
|
|
// however kubelet will not delete the resource
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 0}))
|
|
})
|
|
|
|
ginkgo.It("when ListAndWatch fails after provisioning devices, node allocatable will be set to zero and kubelet will not retry to list resources", func(ctx context.Context) {
|
|
// randomizing so tests can run in parallel
|
|
resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
|
|
devices := []kubeletdevicepluginv1beta1.Device{
|
|
{ID: "0", Health: kubeletdevicepluginv1beta1.Healthy},
|
|
{ID: "1", Health: kubeletdevicepluginv1beta1.Healthy},
|
|
}
|
|
|
|
failing := false
|
|
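		// the callback consults this flag, so ListAndWatch succeeds until
		// the test flips `failing` to true below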
		plugin := testdeviceplugin.NewDevicePlugin(func(name string) error {
			if name == "ListAndWatch" && failing {
				return fmt.Errorf("ListAndWatch failed")
			}
			return nil
		})

		err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
		defer plugin.Stop() // should stop even if registration failed
		gomega.Expect(err).To(gomega.Succeed())

		// at first both devices are healthy
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 2, Capacity: 2}))

		// let's make ListAndWatch fail
		failing = true

		// kubelet will mark all devices as unhealthy
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 2}))

		// kubelet will never retry ListAndWatch (this assertion waits out the full timeout)
		gomega.Consistently(plugin.Calls, devicePluginUpdateTimeout, f.Timeouts.Poll).Should(gomega.HaveLen(2))

		// however kubelet will not delete the resource and will keep the capacity
		gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 2}))

		// after the grace period the device capacity will reset to zero
		gomega.Eventually(getNodeResourceValues, devicePluginGracefulTimeout+1*time.Minute, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 0}))
	})

ginkgo.It("when device plugin is stopped after provisioning devices, node allocatable will be set to zero", func(ctx context.Context) {
|
|
// randomizing so tests can run in parallel
|
|
resourceName := fmt.Sprintf("test.device/%s", f.UniqueName)
|
|
devices := []kubeletdevicepluginv1beta1.Device{
|
|
{ID: "0", Health: kubeletdevicepluginv1beta1.Healthy},
|
|
{ID: "1", Health: kubeletdevicepluginv1beta1.Healthy},
|
|
}
|
|
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: -1, Capacity: -1}))
|
|
|
|
plugin := testdeviceplugin.NewDevicePlugin(nil)
|
|
|
|
err := plugin.RegisterDevicePlugin(ctx, f.UniqueName, resourceName, devices)
|
|
defer plugin.Stop() // should stop even if registration failed
|
|
gomega.Expect(err).To(gomega.Succeed())
|
|
|
|
// at first the device is healthy
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 2, Capacity: 2}))
|
|
|
|
// let's unload the plugin
|
|
plugin.Stop()
|
|
|
|
// kubelet will mark all devices as unhealthy
|
|
gomega.Eventually(getNodeResourceValues, nodeStatusUpdateTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 2}))
|
|
|
|
// after the graceful period devices capacity will reset to zero
|
|
gomega.Eventually(getNodeResourceValues, devicePluginGracefulTimeout+1*time.Minute, f.Timeouts.Poll).WithContext(ctx).WithArguments(resourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: 0}))
|
|
})
|
|
})
|