node: e2e: add test for the checkpoint recovery

Add an e2e test to exercise the checkpoint recovery flow.
This means we need to actually create an old (V1, pre-1.20) checkpoint,
but since we only do that in the e2e test, this is still acceptable.
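For context, the old checkpoint referenced above is the device manager state file the kubelet keeps under its device-plugins directory; in the V1 (pre-1.20) layout the per-container device IDs are recorded as a flat list rather than the newer per-NUMA-node map. Below is a minimal, self-contained sketch of producing such a file; the struct fields, sample values, and kubelet root directory are assumptions for illustration, not the exact code added by this commit.

// Sketch only: write a device manager checkpoint in the old (V1, pre-1.20)
// layout, where per-container device IDs are a flat []string instead of the
// newer per-NUMA-node map. Struct fields, sample values and the kubelet root
// directory are assumptions for illustration, not the code added by this commit.
package main

import (
	"encoding/json"
	"log"
	"os"
	"path/filepath"
)

// podDevicesEntryV1 approximates one entry of the old checkpoint format.
type podDevicesEntryV1 struct {
	PodUID        string
	ContainerName string
	ResourceName  string
	DeviceIDs     []string // V1: flat list, no NUMA affinity
	AllocResp     []byte
}

type checkpointDataV1 struct {
	PodDeviceEntries  []podDevicesEntryV1
	RegisteredDevices map[string][]string
}

type checkpointV1 struct {
	Data     checkpointDataV1
	Checksum uint64 // a real test must compute a checksum the kubelet accepts; left zero here
}

func writeV1Checkpoint(kubeletRootDir string) error {
	cp := checkpointV1{
		Data: checkpointDataV1{
			PodDeviceEntries: []podDevicesEntryV1{{
				PodUID:        "11111111-2222-3333-4444-555555555555", // placeholder UID
				ContainerName: "test-container",
				ResourceName:  "example.com/sriov-nic", // hypothetical resource name
				DeviceIDs:     []string{"dev-0"},
			}},
			RegisteredDevices: map[string][]string{
				"example.com/sriov-nic": {"dev-0", "dev-1"},
			},
		},
	}
	blob, err := json.Marshal(cp)
	if err != nil {
		return err
	}
	// The device manager state file conventionally lives under
	// <kubelet root>/device-plugins/kubelet_internal_checkpoint.
	target := filepath.Join(kubeletRootDir, "device-plugins", "kubelet_internal_checkpoint")
	return os.WriteFile(target, blob, 0o600)
}

func main() {
	if err := writeV1Checkpoint("/var/lib/kubelet"); err != nil {
		log.Fatal(err)
	}
}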

Signed-off-by: Francesco Romani <fromani@redhat.com>
Francesco Romani
2021-10-26 09:31:04 +02:00
parent 2f426fdba6
commit b382b6cd0a
3 changed files with 412 additions and 7 deletions


@@ -505,6 +505,15 @@ type sriovData struct {
 }
 
 func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) *sriovData {
+	sd := createSRIOVConfigOrFail(f, configMap)
+
+	e2enode.WaitForNodeToBeReady(f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)
+
+	sd.pod = createSRIOVPodOrFail(f)
+	return sd
+}
+
+func createSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) *sriovData {
 	var err error
 
 	ginkgo.By(fmt.Sprintf("Creating configMap %v/%v", metav1.NamespaceSystem, configMap.Name))
@@ -522,8 +531,13 @@ func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) *sr
 		framework.Failf("unable to create test serviceAccount %s: %v", serviceAccount.Name, err)
 	}
 
-	e2enode.WaitForNodeToBeReady(f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)
-
+	return &sriovData{
+		configMap:      configMap,
+		serviceAccount: serviceAccount,
+	}
+}
+
+func createSRIOVPodOrFail(f *framework.Framework) *v1.Pod {
 	dp := getSRIOVDevicePluginPod()
 	dp.Spec.NodeName = framework.TestContext.NodeName
@@ -536,11 +550,7 @@ func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) *sr
 	}
 	framework.ExpectNoError(err)
 
-	return &sriovData{
-		configMap:      configMap,
-		serviceAccount: serviceAccount,
-		pod:            dpPod,
-	}
+	return dpPod
 }
 
 // waitForSRIOVResources waits until enough SRIOV resources are available, expecting to complete within the timeout.
@@ -560,7 +570,7 @@ func waitForSRIOVResources(f *framework.Framework, sd *sriovData) {
 	framework.Logf("Detected SRIOV allocatable devices name=%q amount=%d", sd.resourceName, sd.resourceAmount)
 }
 
-func teardownSRIOVConfigOrFail(f *framework.Framework, sd *sriovData) {
+func deleteSRIOVPodOrFail(f *framework.Framework, sd *sriovData) {
 	var err error
 	gp := int64(0)
 	deleteOptions := metav1.DeleteOptions{
@@ -571,6 +581,14 @@ func teardownSRIOVConfigOrFail(f *framework.Framework, sd *sriovData) {
 	err = f.ClientSet.CoreV1().Pods(sd.pod.Namespace).Delete(context.TODO(), sd.pod.Name, deleteOptions)
 	framework.ExpectNoError(err)
 	waitForAllContainerRemoval(sd.pod.Name, sd.pod.Namespace)
+}
+
+func removeSRIOVConfigOrFail(f *framework.Framework, sd *sriovData) {
+	var err error
+	gp := int64(0)
+	deleteOptions := metav1.DeleteOptions{
+		GracePeriodSeconds: &gp,
+	}
 
 	ginkgo.By(fmt.Sprintf("Deleting configMap %v/%v", metav1.NamespaceSystem, sd.configMap.Name))
 	err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Delete(context.TODO(), sd.configMap.Name, deleteOptions)
@@ -581,6 +599,11 @@ func teardownSRIOVConfigOrFail(f *framework.Framework, sd *sriovData) {
 	framework.ExpectNoError(err)
 }
 
+func teardownSRIOVConfigOrFail(f *framework.Framework, sd *sriovData) {
+	deleteSRIOVPodOrFail(f, sd)
+	removeSRIOVConfigOrFail(f, sd)
+}
+
 func runTMScopeResourceAlignmentTestSuite(f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs, policy string, numaNodes, coreCount int) {
 	threadsPerCore := getSMTLevel()
 	sd := setupSRIOVConfigOrFail(f, configMap)
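The refactoring above splits setup and teardown into create/delete halves (createSRIOVConfigOrFail, createSRIOVPodOrFail, deleteSRIOVPodOrFail, removeSRIOVConfigOrFail), presumably so the new checkpoint recovery test can drive the device plugin pod lifecycle independently of the SRIOV configMap and serviceAccount. A rough sketch of how a test in this file could compose them is below; every step other than the helper calls shown in this diff is an assumption:

// Sketch only: composing the split helpers from this diff. The checkpoint
// manipulation and kubelet restart steps are assumptions, not the actual
// test added by this commit.
func runCheckpointRecoverySketch(f *framework.Framework, configMap *v1.ConfigMap) {
	sd := createSRIOVConfigOrFail(f, configMap)
	defer removeSRIOVConfigOrFail(f, sd)

	sd.pod = createSRIOVPodOrFail(f)
	defer deleteSRIOVPodOrFail(f, sd)

	// Wait until the SRIOV device plugin advertises its resources.
	waitForSRIOVResources(f, sd)

	// Hypothetical remainder: overwrite the device manager checkpoint with a
	// V1 (pre-1.20) formatted file, restart the kubelet, and verify that the
	// SRIOV resources become allocatable again once the checkpoint is recovered.
}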