diff --git a/hack/generate-bindata.sh b/hack/generate-bindata.sh
index f8cb2768ff2..25cd5d75d96 100755
--- a/hack/generate-bindata.sh
+++ b/hack/generate-bindata.sh
@@ -47,6 +47,7 @@
 BINDATA_OUTPUT="test/e2e/generated/bindata.go"
 go-bindata -nometadata -o "${BINDATA_OUTPUT}.tmp" -pkg generated \
     -ignore .jpg -ignore .png -ignore .md -ignore 'BUILD(\.bazel)?' \
     "test/e2e/testing-manifests/..." \
+    "test/e2e_node/testing-manifests/..." \
     "test/images/..." \
     "test/fixtures/..."
diff --git a/test/e2e/framework/test_context.go b/test/e2e/framework/test_context.go
index c9b8a37db4f..a9391b72052 100644
--- a/test/e2e/framework/test_context.go
+++ b/test/e2e/framework/test_context.go
@@ -167,6 +167,9 @@ type TestContextType struct {
 
 	// ProgressReportURL is the URL which progress updates will be posted to as tests complete. If empty, no updates are sent.
 	ProgressReportURL string
+
+	// SriovdpConfigMapFile is the path to the ConfigMap to configure the SRIOV device plugin on this host.
+	SriovdpConfigMapFile string
 }
 
 // NodeKillerConfig describes configuration of NodeKiller -- a utility to
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index 00d0fd07c6e..fd368184d27 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -1882,7 +1882,6 @@ func DumpDebugInfo(c clientset.Interface, ns string) {
 
 // DsFromManifest reads a .json/yaml file and returns the daemonset in it.
 func DsFromManifest(url string) (*appsv1.DaemonSet, error) {
-	var ds appsv1.DaemonSet
 	Logf("Parsing ds from %v", url)
 
 	var response *http.Response
@@ -1908,7 +1907,12 @@ func DsFromManifest(url string) (*appsv1.DaemonSet, error) {
 	if err != nil {
 		return nil, fmt.Errorf("Failed to read html response body: %v", err)
 	}
+	return DsFromData(data)
+}
 
+// DsFromData reads a byte slice and returns the daemonset in it.
+func DsFromData(data []byte) (*appsv1.DaemonSet, error) {
+	var ds appsv1.DaemonSet
 	dataJSON, err := utilyaml.ToJSON(data)
 	if err != nil {
 		return nil, fmt.Errorf("Failed to parse data to json: %v", err)
diff --git a/test/e2e/generated/BUILD b/test/e2e/generated/BUILD
index c5d8f822a81..de04cfe7b5e 100644
--- a/test/e2e/generated/BUILD
+++ b/test/e2e/generated/BUILD
@@ -24,6 +24,7 @@ go_bindata(
     name = "bindata",
     srcs = [
         "//test/e2e/testing-manifests:all-srcs",
+        "//test/e2e_node/testing-manifests:all-srcs",
         "//test/fixtures:all-srcs",
         "//test/images:all-srcs",
     ],
diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD
index 29d740a2a97..88e895289d3 100644
--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@@ -15,8 +15,10 @@ go_library(
         "framework.go",
         "image_list.go",
         "node_problem_detector_linux.go",
+        "numa_alignment.go",
         "resource_collector.go",
         "util.go",
+        "util_sriov.go",
         "util_xfs_linux.go",
         "util_xfs_unsupported.go",
     ],
@@ -30,6 +32,7 @@ go_library(
         "//pkg/kubelet/apis/podresources/v1alpha1:go_default_library",
         "//pkg/kubelet/apis/stats/v1alpha1:go_default_library",
         "//pkg/kubelet/cm:go_default_library",
+        "//pkg/kubelet/cm/cpuset:go_default_library",
         "//pkg/kubelet/kubeletconfig/util/codec:go_default_library",
         "//pkg/kubelet/metrics:go_default_library",
         "//pkg/kubelet/remote:go_default_library",
@@ -49,6 +52,7 @@ go_library(
         "//test/e2e/framework/gpu:go_default_library",
         "//test/e2e/framework/metrics:go_default_library",
         "//test/e2e/framework/node:go_default_library",
+        "//test/e2e/framework/testfiles:go_default_library",
         "//test/utils/image:go_default_library",
         "//vendor/github.com/blang/semver:go_default_library",
         "//vendor/github.com/coreos/go-systemd/util:go_default_library",
@@ -266,6 +270,7 @@ filegroup(
         "//test/e2e_node/runner/remote:all-srcs",
         "//test/e2e_node/services:all-srcs",
         "//test/e2e_node/system:all-srcs",
+        "//test/e2e_node/testing-manifests:all-srcs",
     ],
     tags = ["automanaged"],
     visibility = ["//visibility:public"],
diff --git a/test/e2e_node/e2e_node_suite_test.go b/test/e2e_node/e2e_node_suite_test.go
index 33b381b0f3b..0237f7d38b9 100644
--- a/test/e2e_node/e2e_node_suite_test.go
+++ b/test/e2e_node/e2e_node_suite_test.go
@@ -80,6 +80,7 @@ func registerNodeFlags(flags *flag.FlagSet) {
 	flags.StringVar(&framework.TestContext.ImageDescription, "image-description", "", "The description of the image which the test will be running on.")
 	flags.StringVar(&framework.TestContext.SystemSpecName, "system-spec-name", "", "The name of the system spec (e.g., gke) that's used in the node e2e test. The system specs are in test/e2e_node/system/specs/. This is used by the test framework to determine which tests to run for validating the system requirements.")
 	flags.Var(cliflag.NewMapStringString(&framework.TestContext.ExtraEnvs), "extra-envs", "The extra environment variables needed for node e2e tests. Format: a list of key=value pairs, e.g., env1=val1,env2=val2")
+	flags.StringVar(&framework.TestContext.SriovdpConfigMapFile, "sriovdp-configmap-file", "", "The path to the ConfigMap used to configure the SRIOV device plugin on this host.")
 }
 
 func init() {
diff --git a/test/e2e_node/image_list.go b/test/e2e_node/image_list.go
index 2aef9388a27..e896ae36522 100644
--- a/test/e2e_node/image_list.go
+++ b/test/e2e_node/image_list.go
@@ -31,6 +31,7 @@ import (
 	commontest "k8s.io/kubernetes/test/e2e/common"
 	"k8s.io/kubernetes/test/e2e/framework"
 	"k8s.io/kubernetes/test/e2e/framework/gpu"
+	"k8s.io/kubernetes/test/e2e/framework/testfiles"
 	imageutils "k8s.io/kubernetes/test/utils/image"
 )
 
@@ -68,6 +69,7 @@ func updateImageWhiteList() {
 	framework.ImageWhiteList = NodeImageWhiteList.Union(commontest.CommonImageWhiteList)
 	// Images from extra envs
 	framework.ImageWhiteList.Insert(getNodeProblemDetectorImage())
+	framework.ImageWhiteList.Insert(getSRIOVDevicePluginImage())
 }
 
 func getNodeProblemDetectorImage() string {
@@ -184,3 +186,26 @@ func getGPUDevicePluginImage() string {
 	}
 	return ds.Spec.Template.Spec.Containers[0].Image
 }
+
+// getSRIOVDevicePluginImage returns the image of the SRIOV device plugin.
+func getSRIOVDevicePluginImage() string {
+	data, err := testfiles.Read(SRIOVDevicePluginDSYAML)
+	if err != nil {
+		klog.Errorf("Failed to read the device plugin manifest: %v", err)
+		return ""
+	}
+	ds, err := framework.DsFromData(data)
+	if err != nil {
+		klog.Errorf("Failed to parse the device plugin image: %v", err)
+		return ""
+	}
+	if ds == nil {
+		klog.Errorf("Failed to parse the device plugin image: the extracted DaemonSet is nil")
+		return ""
+	}
+	if len(ds.Spec.Template.Spec.Containers) < 1 {
+		klog.Errorf("Failed to parse the device plugin image: cannot extract the container from YAML")
+		return ""
+	}
+	return ds.Spec.Template.Spec.Containers[0].Image
+}
diff --git a/test/e2e_node/numa_alignment.go b/test/e2e_node/numa_alignment.go
new file mode 100644
index 00000000000..33edd964e8c
--- /dev/null
+++ b/test/e2e_node/numa_alignment.go
@@ -0,0 +1,212 @@
+/*
+Copyright 2020 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2enode
+
+import (
+	"fmt"
+	"io/ioutil"
+	"sort"
+	"strconv"
+	"strings"
+
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+
+	"k8s.io/kubernetes/test/e2e/framework"
+)
+
+type numaPodResources struct {
+	CPUToNUMANode     map[int]int
+	PCIDevsToNUMANode map[string]int
+}
+
+func (R *numaPodResources) CheckAlignment() bool {
+	nodeNum := -1 // not set
+	for _, cpuNode := range R.CPUToNUMANode {
+		if nodeNum == -1 {
+			nodeNum = cpuNode
+		} else if nodeNum != cpuNode {
+			return false
+		}
+	}
+	for _, devNode := range R.PCIDevsToNUMANode {
+		// devNode == -1 means the kernel reported no NUMA affinity for the
+		// device (sysfs numa_node is -1), so it cannot break the alignment.
+		if devNode != -1 && nodeNum != devNode {
+			return false
+		}
+	}
+	return true
+}
+
+func (R *numaPodResources) String() string {
+	var b strings.Builder
+	// To store the keys in slice in sorted order
+	var cpuKeys []int
+	for ck := range R.CPUToNUMANode {
+		cpuKeys = append(cpuKeys, ck)
+	}
+	sort.Ints(cpuKeys)
+	for _, k := range cpuKeys {
+		nodeNum := R.CPUToNUMANode[k]
+		b.WriteString(fmt.Sprintf("CPU cpu#%03d=%02d\n", k, nodeNum))
+	}
+	var pciKeys []string
+	for pk := range R.PCIDevsToNUMANode {
+		pciKeys = append(pciKeys, pk)
+	}
+	sort.Strings(pciKeys)
+	for _, k := range pciKeys {
+		nodeNum := R.PCIDevsToNUMANode[k]
+		b.WriteString(fmt.Sprintf("PCI %s=%02d\n", k, nodeNum))
+	}
+	return b.String()
+}
+
+func getCPUsPerNUMANode(nodeNum int) ([]int, error) {
+	nodeCPUList, err := ioutil.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeNum))
+	if err != nil {
+		return nil, err
+	}
+	cpus, err := cpuset.Parse(strings.TrimSpace(string(nodeCPUList)))
+	if err != nil {
+		return nil, err
+	}
+	return cpus.ToSlice(), nil
+}
+
+func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string, numaNodes int) (map[int]int, error) {
+	var cpuIDs []int
+	cpuListAllowedEnvVar := "CPULIST_ALLOWED"
+
+	for name, value := range environ {
+		if name == cpuListAllowedEnvVar {
+			cpus, err := cpuset.Parse(value)
+			if err != nil {
+				return nil, err
+			}
+			cpuIDs = cpus.ToSlice()
+		}
+	}
+	if len(cpuIDs) == 0 {
+		return nil, fmt.Errorf("variable %q not found in environ", cpuListAllowedEnvVar)
+	}
+
+	cpusPerNUMA := make(map[int][]int)
+	for numaNode := 0; numaNode < numaNodes; numaNode++ {
+		nodeCPUList := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name,
+			"/bin/cat", fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", numaNode))
+
+		cpus, err := cpuset.Parse(nodeCPUList)
+		if err != nil {
+			return nil, err
+		}
+		cpusPerNUMA[numaNode] = cpus.ToSlice()
+	}
+
+	// CPU IDs -> NUMA Node ID
+	CPUToNUMANode := make(map[int]int)
+	for nodeNum, cpus := range cpusPerNUMA {
+		for _, cpu := range cpus {
+			CPUToNUMANode[cpu] = nodeNum
+		}
+	}
+
+	// filter out only the allowed CPUs
+	CPUMap := make(map[int]int)
+	for _, cpuID := range cpuIDs {
+		_, ok := CPUToNUMANode[cpuID]
+		if !ok {
+			return nil, fmt.Errorf("CPU %d not found on NUMA map: %v", cpuID, CPUToNUMANode)
+		}
+		CPUMap[cpuID] = CPUToNUMANode[cpuID]
+	}
+	return CPUMap, nil
+}
+
+func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string) (map[string]int, error) {
+	pciDevPrefix := "PCIDEVICE_"
+	// at this point we don't care which plugin selected the device,
+	// we only need to know which devices were assigned to the pod.
+	// Hence, do a prefix search for the variable and fetch the device(s).
+
+	NUMAPerDev := make(map[string]int)
+	for name, value := range environ {
+		if !strings.HasPrefix(name, pciDevPrefix) {
+			continue
+		}
+
+		// a single plugin can allocate more than a single device
+		pciDevs := strings.Split(value, ",")
+		for _, pciDev := range pciDevs {
+			pciDevNUMANode := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name,
+				"/bin/cat", fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", pciDev))

+			nodeNum, err := strconv.Atoi(pciDevNUMANode)
+			if err != nil {
+				return nil, err
+			}
+			NUMAPerDev[pciDev] = nodeNum
+		}
+	}
+	if len(NUMAPerDev) == 0 {
+		return nil, fmt.Errorf("no PCI devices found in environ")
+	}
+	return NUMAPerDev, nil
+}
+
+func makeEnvMap(logs string) (map[string]string, error) {
+	podEnv := strings.Split(logs, "\n")
+	envMap := make(map[string]string)
+	for _, envVar := range podEnv {
+		if len(envVar) == 0 {
+			continue
+		}
+		pair := strings.SplitN(envVar, "=", 2)
+		if len(pair) != 2 {
+			return nil, fmt.Errorf("unable to split %q", envVar)
+		}
+		envMap[pair[0]] = pair[1]
+	}
+	return envMap, nil
+}
+
+func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, logs string, numaNodes int) (numaPodResources, error) {
+	podEnv, err := makeEnvMap(logs)
+	if err != nil {
+		return numaPodResources{}, err
+	}
+
+	CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, podEnv, numaNodes)
+	if err != nil {
+		return numaPodResources{}, err
+	}
+
+	PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, podEnv)
+	if err != nil {
+		return numaPodResources{}, err
+	}
+
+	numaRes := numaPodResources{
+		CPUToNUMANode:     CPUToNUMANode,
+		PCIDevsToNUMANode: PCIDevsToNUMANode,
+	}
+	aligned := numaRes.CheckAlignment()
+	if !aligned {
+		return numaRes, fmt.Errorf("NUMA resources not aligned")
+	}
+	return numaRes, nil
+}
diff --git a/test/e2e_node/testing-manifests/BUILD b/test/e2e_node/testing-manifests/BUILD
new file mode 100644
index 00000000000..7e76248ad95
--- /dev/null
+++ b/test/e2e_node/testing-manifests/BUILD
@@ -0,0 +1,14 @@
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+    name = "package-srcs",
+    srcs = glob(["**"]),
+    tags = ["automanaged"],
+    visibility = ["//visibility:private"],
+)
+
+filegroup(
+    name = "all-srcs",
+    srcs = [":package-srcs"],
+    tags = ["automanaged"],
+)
diff --git a/test/e2e_node/testing-manifests/sriovdp-cm.yaml b/test/e2e_node/testing-manifests/sriovdp-cm.yaml
new file mode 100644
index 00000000000..f37ae0c8d2b
--- /dev/null
+++ b/test/e2e_node/testing-manifests/sriovdp-cm.yaml
@@ -0,0 +1,36 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: sriovdp-config
+  namespace: kube-system
+data:
+  config.json: |
+    {
+        "resourceList": [{
+                "resourceName": "intel_sriov_netdevice",
+                "selectors": {
+                    "vendors": ["8086"],
+                    "devices": ["154c", "10ed"],
+                    "drivers": ["i40evf", "ixgbevf"]
+                }
+            },
+            {
+                "resourceName": "intel_sriov_dpdk",
+                "selectors": {
+                    "vendors": ["8086"],
+                    "devices": ["154c", "10ed"],
+                    "drivers": ["vfio-pci"],
+                    "pfNames": ["enp0s0f0","enp2s2f1"]
+                }
+            },
+            {
+                "resourceName": "mlnx_sriov_rdma",
+                "isRdma": true,
+                "selectors": {
+                    "vendors": ["15b3"],
+                    "devices": ["1018"],
+                    "drivers": ["mlx5_ib"]
+                }
+            }
+        ]
+    }
diff --git a/test/e2e_node/testing-manifests/sriovdp-ds.yaml b/test/e2e_node/testing-manifests/sriovdp-ds.yaml
new file mode 100644
index 00000000000..30f76ff470b
--- /dev/null
+++ b/test/e2e_node/testing-manifests/sriovdp-ds.yaml
@@ -0,0 +1,58 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: kube-sriov-device-plugin-amd64
+  namespace: kube-system
+  labels:
+    tier: node
+    app: sriovdp
+spec:
+  selector:
+    matchLabels:
+      name: sriov-device-plugin
+  template:
+    metadata:
+      labels:
+        name: sriov-device-plugin
+        tier: node
+        app: sriovdp
+    spec:
+      hostNetwork: true
+      hostPID: true
+      nodeSelector:
+        beta.kubernetes.io/arch: amd64
+      tolerations:
+      - key: node-role.kubernetes.io/master
+        operator: Exists
+        effect: NoSchedule
+      serviceAccountName: sriov-device-plugin
+      containers:
+      - name: kube-sriovdp
+        image: docker.io/nfvpe/sriov-device-plugin:v3.1
+        imagePullPolicy: Never
+        args:
+        - --log-dir=sriovdp
+        - --log-level=10
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: devicesock
+          mountPath: /var/lib/kubelet/
+          readOnly: false
+        - name: log
+          mountPath: /var/log
+        - name: config-volume
+          mountPath: /etc/pcidp
+      volumes:
+      - name: devicesock
+        hostPath:
+          path: /var/lib/kubelet/
+      - name: log
+        hostPath:
+          path: /var/log
+      - name: config-volume
+        configMap:
+          name: sriovdp-config
+          items:
+          - key: config.json
+            path: config.json
diff --git a/test/e2e_node/testing-manifests/sriovdp-sa.yaml b/test/e2e_node/testing-manifests/sriovdp-sa.yaml
new file mode 100644
index 00000000000..73bf1199ee2
--- /dev/null
+++ b/test/e2e_node/testing-manifests/sriovdp-sa.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: sriov-device-plugin
+  namespace: kube-system
diff --git a/test/e2e_node/topology_manager_test.go b/test/e2e_node/topology_manager_test.go
index c9f5b2df1e1..51cf16b4bde 100644
--- a/test/e2e_node/topology_manager_test.go
+++ b/test/e2e_node/topology_manager_test.go
@@ -17,10 +17,18 @@ limitations under the License.
 package e2enode
 
 import (
+	"context"
 	"fmt"
+	"io/ioutil"
+	"os/exec"
+	"regexp"
+	"strconv"
+	"strings"
 	"time"
 
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
@@ -29,24 +37,70 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
+	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	"k8s.io/kubernetes/test/e2e/framework/testfiles"
 
 	"github.com/onsi/ginkgo"
 	"github.com/onsi/gomega"
 )
 
+const (
+	numalignCmd = `export CPULIST_ALLOWED=$( awk -F":\t*" '/Cpus_allowed_list/ { print $2 }' /proc/self/status); env; sleep 1d`
+
+	minNumaNodes = 2
+	minCoreCount = 4
+)
+
 // Helper for makeTopologyManagerPod().
 type tmCtnAttribute struct {
-	ctnName    string
-	cpuRequest string
-	cpuLimit   string
+	ctnName       string
+	cpuRequest    string
+	cpuLimit      string
+	deviceName    string
+	deviceRequest string
+	deviceLimit   string
+}
+
+func detectNUMANodes() int {
+	outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"NUMA node(s):\" | cut -d \":\" -f 2").Output()
+	framework.ExpectNoError(err)
+
+	numaNodes, err := strconv.Atoi(strings.TrimSpace(string(outData)))
+	framework.ExpectNoError(err)
+
+	return numaNodes
+}
+
+func detectCoresPerSocket() int {
+	outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"Core(s) per socket:\" | cut -d \":\" -f 2").Output()
+	framework.ExpectNoError(err)
+
+	coreCount, err := strconv.Atoi(strings.TrimSpace(string(outData)))
+	framework.ExpectNoError(err)
+
+	return coreCount
+}
+
+func detectSRIOVDevices() int {
+	outData, err := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/sriov_totalvfs | wc -w").Output()
+	framework.ExpectNoError(err)
+
+	devCount, err := strconv.Atoi(strings.TrimSpace(string(outData)))
+	framework.ExpectNoError(err)
+
+	return devCount
 }
 
 // makeTopologyMangerPod returns a pod with the provided tmCtnAttributes.
 func makeTopologyManagerPod(podName string, tmCtnAttributes []tmCtnAttribute) *v1.Pod {
+	cpusetCmd := "grep Cpus_allowed_list /proc/self/status | cut -f2 && sleep 1d"
+	return makeTopologyManagerTestPod(podName, cpusetCmd, tmCtnAttributes)
+}
+
+func makeTopologyManagerTestPod(podName, podCmd string, tmCtnAttributes []tmCtnAttribute) *v1.Pod {
 	var containers []v1.Container
 	for _, ctnAttr := range tmCtnAttributes {
-		cpusetCmd := fmt.Sprintf("grep Cpus_allowed_list /proc/self/status | cut -f2 && sleep 1d")
 		ctn := v1.Container{
 			Name:  ctnAttr.ctnName,
 			Image: busyboxImage,
@@ -60,7 +114,11 @@ func makeTopologyManagerPod(podName string, tmCtnAttributes []tmCtnAttribute) *v
 					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
 				},
 			},
-			Command: []string{"sh", "-c", cpusetCmd},
+			Command: []string{"sh", "-c", podCmd},
+		}
+		if ctnAttr.deviceName != "" {
+			ctn.Resources.Requests[v1.ResourceName(ctnAttr.deviceName)] = resource.MustParse(ctnAttr.deviceRequest)
+			ctn.Resources.Limits[v1.ResourceName(ctnAttr.deviceName)] = resource.MustParse(ctnAttr.deviceLimit)
 		}
 		containers = append(containers, ctn)
 	}
@@ -76,10 +134,28 @@ func makeTopologyManagerPod(podName string, tmCtnAttributes []tmCtnAttribute) *v
 	}
 }
 
-func configureTopologyManagerInKubelet(f *framework.Framework, policy string) {
+func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
+	for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
+		value, ok := configMap.Annotations[fmt.Sprintf("pcidevice_node%d", nodeNum)]
+		if !ok {
+			framework.Logf("missing pcidevice annotation for NUMA node %d", nodeNum)
+			return -1, false
+		}
+		v, err := strconv.Atoi(value)
+		if err != nil {
+			framework.Failf("error getting the PCI device count on NUMA node %d: %v", nodeNum, err)
+		}
+		if v == 0 {
+			framework.Logf("NUMA node %d has no SRIOV devices attached", nodeNum)
+			return nodeNum, true
+		}
+		framework.Logf("NUMA node %d has %d SRIOV devices attached", nodeNum, v)
+	}
+	return -1, false
+}
+
+func configureTopologyManagerInKubelet(f *framework.Framework, oldCfg *kubeletconfig.KubeletConfiguration, policy string, configMap *v1.ConfigMap, numaNodes int) string {
 	// Configure Topology Manager in Kubelet with policy.
-	oldCfg, err := getCurrentKubeletConfig()
-	framework.ExpectNoError(err)
 	newCfg := oldCfg.DeepCopy()
 	if newCfg.FeatureGates == nil {
 		newCfg.FeatureGates = make(map[string]bool)
@@ -92,7 +168,6 @@ func configureTopologyManagerInKubelet(f *framework.Framework, policy string) {
 
 	// Set the Topology Manager policy
 	newCfg.TopologyManagerPolicy = policy
-	//newCfg.TopologyManagerPolicy = topologymanager.PolicySingleNumaNode
 
 	// Set the CPU Manager policy to static.
 	newCfg.CPUManagerPolicy = string(cpumanager.PolicyStatic)
@@ -100,18 +175,25 @@
 	// Set the CPU Manager reconcile period to 1 second.
 	newCfg.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}
 
-	// The Kubelet panics if either kube-reserved or system-reserved is not set
-	// when CPU Manager is enabled. Set cpu in kube-reserved > 0 so that
-	// kubelet doesn't panic.
-	if newCfg.KubeReserved == nil {
-		newCfg.KubeReserved = map[string]string{}
-	}
+	if nodeNum, ok := findNUMANodeWithoutSRIOVDevices(configMap, numaNodes); ok {
+		cpus, err := getCPUsPerNUMANode(nodeNum)
+		framework.ExpectNoError(err)
+		framework.Logf("NUMA Node %d doesn't seem to have attached SRIOV devices and has cpus=%v", nodeNum, cpus)
+		newCfg.ReservedSystemCPUs = fmt.Sprintf("%d", cpus[len(cpus)-1])
+	} else {
+		// The Kubelet panics if either kube-reserved or system-reserved is not set
+		// when CPU Manager is enabled. Set cpu in kube-reserved > 0 so that
+		// kubelet doesn't panic.
+		if newCfg.KubeReserved == nil {
+			newCfg.KubeReserved = map[string]string{}
+		}
 
-	if _, ok := newCfg.KubeReserved["cpu"]; !ok {
-		newCfg.KubeReserved["cpu"] = "200m"
+		if _, ok := newCfg.KubeReserved["cpu"]; !ok {
+			newCfg.KubeReserved["cpu"] = "200m"
+		}
 	}
 
 	// Dump the config -- debug
-	framework.Logf("New kublet config is %s", *newCfg)
+	framework.Logf("New kubelet config is %s", *newCfg)
 
 	// Update the Kubelet configuration.
 	framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
@@ -122,9 +204,79 @@
 		framework.ExpectNoError(err)
 		return nodes == 1
 	}, time.Minute, time.Second).Should(gomega.BeTrue())
+
+	return newCfg.ReservedSystemCPUs
 }
 
-func runTopologyManagerSuiteTests(f *framework.Framework) {
+// getSRIOVDevicePluginPod returns the Device Plugin pod for sriov resources in e2e tests.
+func getSRIOVDevicePluginPod() *v1.Pod {
+	ds := readDaemonSetV1OrDie(testfiles.ReadOrDie(SRIOVDevicePluginDSYAML))
+	p := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      SRIOVDevicePluginName,
+			Namespace: metav1.NamespaceSystem,
+		},
+
+		Spec: ds.Spec.Template.Spec,
+	}
+
+	return p
+}
+
+func readConfigMapV1OrDie(objBytes []byte) *v1.ConfigMap {
+	v1.AddToScheme(appsScheme)
+	requiredObj, err := runtime.Decode(appsCodecs.UniversalDecoder(v1.SchemeGroupVersion), objBytes)
+	if err != nil {
+		panic(err)
+	}
+	return requiredObj.(*v1.ConfigMap)
+}
+
+func readServiceAccountV1OrDie(objBytes []byte) *v1.ServiceAccount {
+	v1.AddToScheme(appsScheme)
+	requiredObj, err := runtime.Decode(appsCodecs.UniversalDecoder(v1.SchemeGroupVersion), objBytes)
+	if err != nil {
+		panic(err)
+	}
+	return requiredObj.(*v1.ServiceAccount)
+}
+
+func findSRIOVResource(node *v1.Node) (string, int64) {
+	re := regexp.MustCompile(`^intel.com/.*sriov.*`)
+	for key, val := range node.Status.Capacity {
+		resource := string(key)
+		if re.MatchString(resource) {
+			v := val.Value()
+			if v > 0 {
+				return resource, v
+			}
+		}
+	}
+	return "", 0
+}
+
+func deletePodInNamespace(f *framework.Framework, namespace, name string) {
+	gp := int64(0)
+	deleteOptions := metav1.DeleteOptions{
+		GracePeriodSeconds: &gp,
+	}
+	err := f.ClientSet.CoreV1().Pods(namespace).Delete(context.TODO(), name, &deleteOptions)
+	framework.ExpectNoError(err)
+}
+
+func validatePodAlignment(f *framework.Framework, pod *v1.Pod, numaNodes int) {
+	ginkgo.By("validating the Gu pod")
+	logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod.Spec.Containers[0].Name, pod.Name)
+
+	framework.Logf("got pod logs: %v", logs)
+	numaRes, err := checkNUMAAlignment(f, pod, logs, numaNodes)
+	framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]: %s",
+		pod.Spec.Containers[0].Name, pod.Name, numaRes.String())
+}
+
+func runTopologyManagerPolicySuiteTests(f *framework.Framework) {
 	var cpuCap, cpuAlloc int64
 	var cpuListString, expAllowedCPUsListRegex string
 	var cpuList []int
@@ -350,27 +502,252 @@ func runTopologyManagerSuiteTests(f *framework.Framework) {
 	waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
 }
 
+func runTopologyManagerPositiveTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
+	var pods []*v1.Pod
+
+	for podID := 0; podID < numPods; podID++ {
+		ctnAttrs := []tmCtnAttribute{
+			{
+				ctnName:       "gu-container",
+				cpuRequest:    cpuAmount,
+				cpuLimit:      cpuAmount,
+				deviceName:    sriovResourceName,
+				deviceRequest: deviceAmount,
+				deviceLimit:   deviceAmount,
+			},
+		}
+
+		podName := fmt.Sprintf("gu-pod-%d", podID)
+		framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
+		pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
+		pod = f.PodClient().CreateSync(pod)
+		framework.Logf("created pod %s", podName)
+		pods = append(pods, pod)
+	}
+
+	for podID := 0; podID < numPods; podID++ {
+		validatePodAlignment(f, pods[podID], numaNodes)
+	}
+
+	for podID := 0; podID < numPods; podID++ {
+		pod := pods[podID]
+		framework.Logf("deleting the pod %s/%s and waiting for container %s removal",
+			pod.Namespace, pod.Name, pod.Spec.Containers[0].Name)
+		deletePods(f, []string{pod.Name})
+		waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
+	}
+}
+
+func runTopologyManagerNegativeTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
+	ctnAttrs := []tmCtnAttribute{
+		{
+			ctnName:       "gu-container",
+			cpuRequest:    cpuAmount,
+			cpuLimit:      cpuAmount,
+			deviceName:    sriovResourceName,
+			deviceRequest: deviceAmount,
+			deviceLimit:   deviceAmount,
+		},
+	}
+
+	podName := "gu-pod"
+	framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
+	pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
+
+	pod = f.PodClient().Create(pod)
+	err := e2epod.WaitForPodCondition(f.ClientSet, f.Namespace.Name, pod.Name, "Failed", 30*time.Second, func(pod *v1.Pod) (bool, error) {
+		if pod.Status.Phase != v1.PodPending {
+			return true, nil
+		}
+		return false, nil
+	})
+	framework.ExpectNoError(err)
+	pod, err = f.PodClient().Get(context.TODO(), pod.Name, metav1.GetOptions{})
+	framework.ExpectNoError(err)
+
+	if pod.Status.Phase != v1.PodFailed {
+		framework.Failf("pod %s not failed: %v", pod.Name, pod.Status)
+	}
+	if !isTopologyAffinityError(pod) {
+		framework.Failf("pod %s failed for wrong reason: %q", pod.Name, pod.Status.Reason)
+	}
+
+	deletePods(f, []string{pod.Name})
+}
+
+func isTopologyAffinityError(pod *v1.Pod) bool {
+	re := regexp.MustCompile(`Topology.*Affinity.*Error`)
+	return re.MatchString(pod.Status.Reason)
+}
+
+func getSRIOVDevicePluginConfigMap(cmFile string) *v1.ConfigMap {
+	cmData := testfiles.ReadOrDie(SRIOVDevicePluginCMYAML)
+	var err error
+
+	// the SRIOVDP configuration is hw-dependent, so we allow per-test-host customization.
+	framework.Logf("host-local SRIOV Device Plugin Config Map %q", cmFile)
+	if cmFile != "" {
+		cmData, err = ioutil.ReadFile(cmFile)
+		if err != nil {
+			framework.Failf("unable to load the SRIOV Device Plugin ConfigMap: %v", err)
+		}
+	} else {
+		framework.Logf("Using built-in SRIOV Device Plugin Config Map")
+	}
+
+	return readConfigMapV1OrDie(cmData)
+}
+
+func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) (*v1.Pod, string, int64) {
+	var err error
+
+	ginkgo.By(fmt.Sprintf("Creating configMap %v/%v", metav1.NamespaceSystem, configMap.Name))
+	if _, err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Create(context.TODO(), configMap, metav1.CreateOptions{}); err != nil {
+		framework.Failf("unable to create test configMap %s: %v", configMap.Name, err)
+	}
+
+	serviceAccount := readServiceAccountV1OrDie(testfiles.ReadOrDie(SRIOVDevicePluginSAYAML))
+	ginkgo.By(fmt.Sprintf("Creating serviceAccount %v/%v", metav1.NamespaceSystem, serviceAccount.Name))
+	if _, err = f.ClientSet.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Create(context.TODO(), serviceAccount, metav1.CreateOptions{}); err != nil {
+		framework.Failf("unable to create test serviceAccount %s: %v", serviceAccount.Name, err)
+	}
+
+	e2enode.WaitForNodeToBeReady(f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)
+
+	dp := getSRIOVDevicePluginPod()
+	dp.Spec.NodeName = framework.TestContext.NodeName
+
+	ginkgo.By("Create SRIOV device plugin pod")
+	dpPod, err := f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), dp, metav1.CreateOptions{})
+	framework.ExpectNoError(err)
+
+	sriovResourceName := ""
+	var sriovResourceAmount int64
+	ginkgo.By("Waiting for devices to become available on the local node")
+	gomega.Eventually(func() bool {
+		node := getLocalNode(f)
+		framework.Logf("Node status: %v", node.Status.Capacity)
+		sriovResourceName, sriovResourceAmount = findSRIOVResource(node)
+		return sriovResourceAmount > 0
+	}, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())
+	framework.Logf("Successfully created device plugin pod, detected %d SRIOV devices of resource %q", sriovResourceAmount, sriovResourceName)
+
+	return dpPod, sriovResourceName, sriovResourceAmount
+}
+
+func teardownSRIOVConfigOrFail(f *framework.Framework, dpPod *v1.Pod) {
+	framework.Logf("deleting the SRIOV device plugin pod %s/%s and waiting for container %s removal",
+		dpPod.Namespace, dpPod.Name, dpPod.Spec.Containers[0].Name)
+	deletePodInNamespace(f, dpPod.Namespace, dpPod.Name)
+	waitForContainerRemoval(dpPod.Spec.Containers[0].Name, dpPod.Name, dpPod.Namespace)
+}
+
+func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs string, numaNodes, coreCount int) {
+	threadsPerCore := 1
+	if isHTEnabled() {
+		threadsPerCore = 2
+	}
+
+	dpPod, sriovResourceName, sriovResourceAmount := setupSRIOVConfigOrFail(f, configMap)
+
+	// this could have been a loop, but we unroll it to make each test case explicit
+
+	// simplest case
+	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sriovResourceName))
+	runTopologyManagerPositiveTest(f, numaNodes, 1, "1000m", sriovResourceName, "1")
+
+	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 2 cores, 1 %s device", sriovResourceName))
+	runTopologyManagerPositiveTest(f, numaNodes, 1, "2000m", sriovResourceName, "1")
+
+	if reservedSystemCPUs != "" {
+		// to avoid false negatives, the reserved CPUs were picked so that at least one NUMA node
+		// has 1+ SRIOV devices and no reserved CPUs.
+		numCores := threadsPerCore * coreCount
+		ginkgo.By(fmt.Sprintf("Successfully admit an entire socket (%d cores), 1 %s device", numCores, sriovResourceName))
+		runTopologyManagerPositiveTest(f, numaNodes, 1, fmt.Sprintf("%dm", numCores*1000), sriovResourceName, "1")
+	}
+
+	if sriovResourceAmount > 1 {
+		// no matter how buses are connected to NUMA nodes or how SRIOV devices are installed,
+		// this function's preconditions must ensure the following can be fulfilled
+		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 1 core, 1 %s device", sriovResourceName))
+		runTopologyManagerPositiveTest(f, numaNodes, 2, "1000m", sriovResourceName, "1")
+
+		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 2 cores, 1 %s device", sriovResourceName))
+		runTopologyManagerPositiveTest(f, numaNodes, 2, "2000m", sriovResourceName, "1")
+
+		// testing more complex conditions requires knowledge about the system cpu+bus topology
+	}
+
+	// overflow NUMA node capacity: cores
+	numCores := 1 + (threadsPerCore * coreCount)
+	ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pod with %d cores, 1 %s device - and it should be rejected", numCores, sriovResourceName))
+	runTopologyManagerNegativeTest(f, numaNodes, 1, fmt.Sprintf("%dm", numCores*1000), sriovResourceName, "1")
+
+	teardownSRIOVConfigOrFail(f, dpPod)
+}
+
 func runTopologyManagerTests(f *framework.Framework) {
 	var oldCfg *kubeletconfig.KubeletConfiguration
+	var err error
 
-	ginkgo.It("run Topology Manager test suite", func() {
+	ginkgo.It("run Topology Manager policy test suite", func() {
+		oldCfg, err = getCurrentKubeletConfig()
+		framework.ExpectNoError(err)
 		var policies = []string{topologymanager.PolicySingleNumaNode, topologymanager.PolicyRestricted,
 			topologymanager.PolicyBestEffort, topologymanager.PolicyNone}
 
 		for _, policy := range policies {
 			// Configure Topology Manager
-			ginkgo.By("by configuring Topology Manager policy to xxx")
+			ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
 			framework.Logf("Configuring topology Manager policy to %s", policy)
-			configureTopologyManagerInKubelet(f, policy)
+
+			configureTopologyManagerInKubelet(f, oldCfg, policy, nil, 0)
 			// Run the tests
-			runTopologyManagerSuiteTests(f)
+			runTopologyManagerPolicySuiteTests(f)
 		}
 		// restore kubelet config
 		setOldKubeletConfig(f, oldCfg)
 
-		// Debug sleep to allow time to look at kubelet config
-		time.Sleep(5 * time.Minute)
+		// Delete state file to allow repeated runs
+		deleteStateFile()
+	})
+
+	ginkgo.It("run Topology Manager node alignment test suite", func() {
+		// this is a very rough check. We just want to rule out systems that do NOT have
+		// any SRIOV device. A more precise check is done in runTopologyManagerPositiveTest.
+		sriovdevCount := detectSRIOVDevices()
+		numaNodes := detectNUMANodes()
+		coreCount := detectCoresPerSocket()
+
+		if numaNodes < minNumaNodes {
+			e2eskipper.Skipf("this test is meant to run on a multi-node NUMA system")
+		}
+		if coreCount < minCoreCount {
+			e2eskipper.Skipf("this test is meant to run on a system with at least 4 cores per socket")
+		}
+		if sriovdevCount == 0 {
+			e2eskipper.Skipf("this test is meant to run on a system with at least one SRIOV device")
+		}
+
+		configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
+
+		oldCfg, err = getCurrentKubeletConfig()
+		framework.ExpectNoError(err)
+
+		policy := topologymanager.PolicySingleNumaNode
+
+		// Configure Topology Manager
+		ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
+		framework.Logf("Configuring topology Manager policy to %s", policy)
+
+		reservedSystemCPUs := configureTopologyManagerInKubelet(f, oldCfg, policy, configMap, numaNodes)
+
+		runTopologyManagerNodeAlignmentSuiteTests(f, configMap, reservedSystemCPUs, numaNodes, coreCount)
+
+		// restore kubelet config
+		setOldKubeletConfig(f, oldCfg)
 
 		// Delete state file to allow repeated runs
 		deleteStateFile()
diff --git a/test/e2e_node/util_sriov.go b/test/e2e_node/util_sriov.go
new file mode 100644
index 00000000000..4b404332157
--- /dev/null
+++ b/test/e2e_node/util_sriov.go
@@ -0,0 +1,28 @@
+/*
+Copyright 2020 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2enode
+
+const (
+	// SRIOVDevicePluginCMYAML is the path of the config map to configure the sriov device plugin.
+	SRIOVDevicePluginCMYAML = "test/e2e_node/testing-manifests/sriovdp-cm.yaml"
+	// SRIOVDevicePluginDSYAML is the path of the daemonset template of the sriov device plugin.
+	// TODO: Parametrize it by making it a feature in TestFramework.
+	SRIOVDevicePluginDSYAML = "test/e2e_node/testing-manifests/sriovdp-ds.yaml"
+	// SRIOVDevicePluginSAYAML is the path of the service account needed by the sriov device plugin to run.
+	SRIOVDevicePluginSAYAML = "test/e2e_node/testing-manifests/sriovdp-sa.yaml"
+	// SRIOVDevicePluginName is the name of the device plugin pod.
+	SRIOVDevicePluginName = "sriov-device-plugin"
+)
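Reviewer note: the alignment logic in numa_alignment.go is easiest to follow in isolation. Below is a minimal, self-contained sketch (not part of the patch) of the two pieces the validation hinges on: makeEnvMap, which parses the `env` dump that numalignCmd writes to the container logs, and numaPodResources.CheckAlignment, which verifies every allowed CPU and every assigned PCI device sits on a single NUMA node. The env dump and the PCI address below are made up for illustration.

```go
package main

import (
	"fmt"
	"strings"
)

// numaPodResources mirrors the struct added in numa_alignment.go.
type numaPodResources struct {
	CPUToNUMANode     map[int]int    // CPU ID -> NUMA node
	PCIDevsToNUMANode map[string]int // PCI address -> NUMA node
}

// CheckAlignment mirrors the patch: all CPUs must share one NUMA node,
// and every device with known affinity must sit on that same node.
func (r *numaPodResources) CheckAlignment() bool {
	nodeNum := -1 // not set
	for _, cpuNode := range r.CPUToNUMANode {
		if nodeNum == -1 {
			nodeNum = cpuNode
		} else if nodeNum != cpuNode {
			return false
		}
	}
	for _, devNode := range r.PCIDevsToNUMANode {
		// -1 is what sysfs reports when a device has no NUMA affinity.
		if devNode != -1 && nodeNum != devNode {
			return false
		}
	}
	return true
}

// makeEnvMap mirrors the helper in the patch: it splits "KEY=VALUE" lines.
func makeEnvMap(logs string) (map[string]string, error) {
	envMap := make(map[string]string)
	for _, envVar := range strings.Split(logs, "\n") {
		if len(envVar) == 0 {
			continue
		}
		pair := strings.SplitN(envVar, "=", 2)
		if len(pair) != 2 {
			return nil, fmt.Errorf("unable to split %q", envVar)
		}
		envMap[pair[0]] = pair[1]
	}
	return envMap, nil
}

func main() {
	// Hypothetical env dump: CPUs 0-1 and the VF both live on NUMA node 0.
	logs := "CPULIST_ALLOWED=0-1\nPCIDEVICE_INTEL_COM_SRIOV=0000:03:02.0\n"
	env, err := makeEnvMap(logs)
	if err != nil {
		panic(err)
	}
	fmt.Println(env["CPULIST_ALLOWED"]) // "0-1"

	res := numaPodResources{
		CPUToNUMANode:     map[int]int{0: 0, 1: 0},
		PCIDevsToNUMANode: map[string]int{"0000:03:02.0": 0},
	}
	fmt.Println(res.CheckAlignment()) // true: everything on node 0
}
```

Tolerating -1 is a deliberate design choice: a device whose NUMA affinity is unknown to the kernel cannot prove misalignment, so it is skipped rather than failing the test.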
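Similarly, the DsFromManifest/DsFromData split in test/e2e/framework/util.go makes the YAML-to-DaemonSet decoding reusable for manifests compiled into bindata (as getSRIOVDevicePluginImage does). The hunk elides the tail of DsFromData, so the following is a sketch of the equivalent decoding path, assuming the usual utilyaml.ToJSON + json.Unmarshal combination and an invented manifest/image name:

```go
package main

import (
	"encoding/json"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	utilyaml "k8s.io/apimachinery/pkg/util/yaml"
)

// dsFromData sketches the new framework.DsFromData helper:
// convert the YAML bytes to JSON, then unmarshal into a DaemonSet.
func dsFromData(data []byte) (*appsv1.DaemonSet, error) {
	var ds appsv1.DaemonSet
	dataJSON, err := utilyaml.ToJSON(data)
	if err != nil {
		return nil, fmt.Errorf("failed to parse data to json: %v", err)
	}
	if err := json.Unmarshal(dataJSON, &ds); err != nil {
		return nil, fmt.Errorf("failed to unmarshal DaemonSet: %v", err)
	}
	return &ds, nil
}

func main() {
	// Hypothetical manifest; only the fields we read are filled in.
	manifest := []byte(`
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: example
spec:
  template:
    spec:
      containers:
      - name: main
        image: registry.example.com/plugin:v1
`)
	ds, err := dsFromData(manifest)
	if err != nil {
		panic(err)
	}
	// Prints "registry.example.com/plugin:v1" -- the same field
	// getSRIOVDevicePluginImage extracts from the real manifest.
	fmt.Println(ds.Spec.Template.Spec.Containers[0].Image)
}
```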