Merge pull request #42204 from dashpole/allocatable_eviction
Automatic merge from submit-queue

**Eviction Manager Enforces Allocatable Thresholds**

This PR modifies the eviction manager to enforce node allocatable thresholds for memory, as described in kubernetes/community#348. This PR should be merged after #41234.

cc @kubernetes/sig-node-pr-reviews @kubernetes/sig-node-feature-requests @vishh

**Why is this a bug/regression?**

The kubelet uses `oom_score_adj` to enforce QoS policies, but `oom_score_adj` is derived from overall requested memory. As a result, a Burstable pod that requests a lot of memory can cause OOM kills of Guaranteed pods, which violates QoS. Even worse, we have observed system daemons such as the kubelet and kube-proxy being killed by the OOM killer. Without this PR, v1.6 would ship with node stability issues and a regression in out-of-resource handling, an existing GA feature.
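For context, the node allocatable proposal (kubernetes/community#348) defines the memory budget available to pods as node capacity minus the system and kube reservations and the hard eviction threshold. The standalone Go sketch below is illustrative only: the capacity figure is hypothetical and none of the identifiers are kubelet APIs. It just shows why the reservations configured in the new test below cause allocatable-based eviction to trigger well before the node-level hard threshold.

```go
// Illustrative sketch (not kubelet code): how memory reservations shrink the
// budget that pods are evaluated against under node-allocatable enforcement.
package main

import "fmt"

func main() {
	const gi = int64(1 << 30)

	capacity := 8 * gi            // hypothetical node memory capacity
	systemReserved := 1 * gi      // SystemReserved memory=1Gi, as in the new test
	kubeReserved := 1 * gi        // KubeReserved memory=1Gi, as in the new test
	hardEviction := capacity / 10 // EvictionHard = "memory.available<10%"

	// Allocatable = Capacity - KubeReserved - SystemReserved - HardEvictionThreshold
	allocatable := capacity - kubeReserved - systemReserved - hardEviction
	fmt.Printf("capacity:    %.1f Gi\n", float64(capacity)/float64(gi))
	fmt.Printf("allocatable: %.1f Gi\n", float64(allocatable)/float64(gi))

	// Pods are evaluated against this smaller allocatable budget, so the
	// eviction manager reports memory pressure (and starts evicting) long
	// before node-wide memory.available drops below the 10% hard threshold.
}
```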
@@ -57,6 +57,7 @@ go_library(
 go_test(
     name = "go_default_test",
     srcs = [
+        "allocatable_eviction_test.go",
         "apparmor_test.go",
         "container_manager_test.go",
         "critical_pod_test.go",
test/e2e_node/allocatable_eviction_test.go (new file, 104 lines)
@@ -0,0 +1,104 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e_node

import (
    "fmt"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/kubernetes/pkg/api/v1"
    "k8s.io/kubernetes/pkg/apis/componentconfig"
    "k8s.io/kubernetes/pkg/kubelet/cm"
    "k8s.io/kubernetes/test/e2e/framework"

    . "github.com/onsi/ginkgo"
    . "github.com/onsi/gomega"
)

// Eviction Policy is described here:
// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md

var _ = framework.KubeDescribe("AllocatableEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
    f := framework.NewDefaultFramework("allocatable-eviction-test")

    podTestSpecs := []podTestSpec{
        {
            evictionPriority: 1, // This pod should be evicted before the innocent pod
            pod:              *getMemhogPod("memory-hog-pod", "memory-hog", v1.ResourceRequirements{}),
        },
        {
            evictionPriority: 0, // This pod should never be evicted
            pod: v1.Pod{
                ObjectMeta: metav1.ObjectMeta{Name: "innocent-pod"},
                Spec: v1.PodSpec{
                    RestartPolicy: v1.RestartPolicyNever,
                    Containers: []v1.Container{
                        {
                            Image: "gcr.io/google_containers/busybox:1.24",
                            Name:  "normal-memory-usage-container",
                            Command: []string{
                                "sh",
                                "-c", // make one big (5 Gb) file
                                "dd if=/dev/urandom of=largefile bs=5000000000 count=1; while true; do sleep 5; done",
                            },
                        },
                    },
                },
            },
        },
    }
    evictionTestTimeout := 40 * time.Minute
    testCondition := "Memory Pressure"
    kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
        initialConfig.EvictionHard = "memory.available<10%"
        // Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds.
        initialConfig.SystemReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
        initialConfig.KubeReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
        initialConfig.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey}
        initialConfig.ExperimentalNodeAllocatableIgnoreEvictionThreshold = false
        initialConfig.CgroupsPerQOS = true
    }
    runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasMemoryPressure, kubeletConfigUpdate)
})

// Returns TRUE if the node has Memory Pressure, FALSE otherwise
func hasMemoryPressure(f *framework.Framework, testCondition string) (bool, error) {
    localNodeStatus := getLocalNode(f).Status
    _, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeMemoryPressure)
    Expect(pressure).NotTo(BeNil())
    hasPressure := pressure.Status == v1.ConditionTrue
    By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))

    // Additional Logging relating to Memory
    summary, err := getNodeSummary()
    if err != nil {
        return false, err
    }
    if summary.Node.Memory != nil && summary.Node.Memory.WorkingSetBytes != nil && summary.Node.Memory.AvailableBytes != nil {
        framework.Logf("Node.Memory.WorkingSetBytes: %d, summary.Node.Memory.AvailableBytes: %d", *summary.Node.Memory.WorkingSetBytes, *summary.Node.Memory.AvailableBytes)
    }
    for _, pod := range summary.Pods {
        framework.Logf("Pod: %s", pod.PodRef.Name)
        for _, container := range pod.Containers {
            if container.Memory != nil && container.Memory.WorkingSetBytes != nil {
                framework.Logf("--- summary Container: %s WorkingSetBytes: %d", container.Name, *container.Memory.WorkingSetBytes)
            }
        }
    }
    return hasPressure, nil
}
@@ -22,6 +22,7 @@ import (
 
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/kubernetes/pkg/api/v1"
+    "k8s.io/kubernetes/pkg/apis/componentconfig"
     "k8s.io/kubernetes/test/e2e/framework"
 
     . "github.com/onsi/ginkgo"
@@ -112,10 +113,11 @@ var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive] [Flak
     }
     evictionTestTimeout := 30 * time.Minute
     testCondition := "Disk Pressure due to Inodes"
-    // Set the EvictionHard threshold lower to decrease test time
-    evictionHardLimit := "nodefs.inodesFree<50%"
+    kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
+        initialConfig.EvictionHard = "nodefs.inodesFree<50%"
+    }
 
-    runEvictionTest(f, testCondition, podTestSpecs, evictionHardLimit, evictionTestTimeout, hasInodePressure)
+    runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure, kubeletConfigUpdate)
 })
 
 // Struct used by runEvictionTest that specifies the pod, and when that pod should be evicted, relative to other pods
@@ -133,12 +135,12 @@ type podTestSpec struct {
 // It ensures that pods with a higher evictionPriority are always evicted before pods with a lower evictionPriority (2 evicted before 1, etc.)
 // It ensures that all pods with a non-zero evictionPriority are eventually evicted.
 // runEvictionTest then cleans up the testing environment by deleting the provided pods, and ensures that testCondition no longer exists
-func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionHard string,
-    evictionTestTimeout time.Duration, hasPressureCondition func(*framework.Framework, string) (bool, error)) {
+func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionTestTimeout time.Duration,
+    hasPressureCondition func(*framework.Framework, string) (bool, error), updateFunction func(initialConfig *componentconfig.KubeletConfiguration)) {
 
     Context(fmt.Sprintf("when we run containers that should cause %s", testCondition), func() {
 
-        tempSetEvictionHard(f, evictionHard)
+        tempSetCurrentKubeletConfig(f, updateFunction)
         BeforeEach(func() {
             By("setting up pods to be used by tests")
             for _, spec := range podTestSpecs {
@@ -148,6 +150,11 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs
         })
 
         It(fmt.Sprintf("should eventually see %s, and then evict all of the correct pods", testCondition), func() {
+            configEnabled, err := isKubeletConfigEnabled(f)
+            framework.ExpectNoError(err)
+            if !configEnabled {
+                framework.Skipf("Dynamic kubelet config must be enabled for this test to run.")
+            }
             Eventually(func() error {
                 hasPressure, err := hasPressureCondition(f, testCondition)
                 if err != nil {
@@ -299,14 +306,8 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs
 
 // Returns TRUE if disk pressure due to inodes exists on the node, FALSE otherwise
 func hasInodePressure(f *framework.Framework, testCondition string) (bool, error) {
-
-    nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
-    framework.ExpectNoError(err, "getting node list")
-    if len(nodeList.Items) != 1 {
-        return false, fmt.Errorf("expected 1 node, but see %d. List: %v", len(nodeList.Items), nodeList.Items)
-    }
-
-    _, pressure := v1.GetNodeCondition(&nodeList.Items[0].Status, v1.NodeDiskPressure)
+    localNodeStatus := getLocalNode(f).Status
+    _, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeDiskPressure)
     Expect(pressure).NotTo(BeNil())
     hasPressure := pressure.Status == v1.ConditionTrue
     By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))
@@ -136,7 +136,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
         By("creating a guaranteed pod, a burstable pod, and a besteffort pod.")
 
         // A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
-        guaranteed := createMemhogPod(f, "guaranteed-", "guaranteed", v1.ResourceRequirements{
+        guaranteed := getMemhogPod("guaranteed-pod", "guaranteed", v1.ResourceRequirements{
             Requests: v1.ResourceList{
                 "cpu":    resource.MustParse("100m"),
                 "memory": resource.MustParse("100Mi"),
@@ -145,16 +145,22 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
                 "cpu":    resource.MustParse("100m"),
                 "memory": resource.MustParse("100Mi"),
             }})
+        guaranteed = f.PodClient().CreateSync(guaranteed)
+        glog.Infof("pod created with name: %s", guaranteed.Name)
 
         // A pod is burstable if limits and requests do not match across all containers.
-        burstable := createMemhogPod(f, "burstable-", "burstable", v1.ResourceRequirements{
+        burstable := getMemhogPod("burstable-pod", "burstable", v1.ResourceRequirements{
             Requests: v1.ResourceList{
                 "cpu":    resource.MustParse("100m"),
                 "memory": resource.MustParse("100Mi"),
             }})
+        burstable = f.PodClient().CreateSync(burstable)
+        glog.Infof("pod created with name: %s", burstable.Name)
 
-        // A pod is besteffort if none of its containers have specified any requests or limits.
-        besteffort := createMemhogPod(f, "besteffort-", "besteffort", v1.ResourceRequirements{})
+        // A pod is besteffort if none of its containers have specified any requests or limits .
+        besteffort := getMemhogPod("besteffort-pod", "besteffort", v1.ResourceRequirements{})
+        besteffort = f.PodClient().CreateSync(besteffort)
+        glog.Infof("pod created with name: %s", besteffort.Name)
 
         // We poll until timeout or all pods are killed.
         // Inside the func, we check that all pods are in a valid phase with
@@ -232,7 +238,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
 
     })
 
-func createMemhogPod(f *framework.Framework, genName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
+func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
     env := []v1.EnvVar{
         {
             Name: "MEMORY_LIMIT",
@@ -256,9 +262,9 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
         memLimit = "$(MEMORY_LIMIT)"
     }
 
-    pod := &v1.Pod{
+    return &v1.Pod{
         ObjectMeta: metav1.ObjectMeta{
-            GenerateName: genName,
+            Name: podName,
         },
         Spec: v1.PodSpec{
             RestartPolicy: v1.RestartPolicyNever,
@@ -277,8 +283,4 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
             },
         },
     }
-    // The generated pod.Name will be on the pod spec returned by CreateSync
-    pod = f.PodClient().CreateSync(pod)
-    glog.Infof("pod created with name: %s", pod.Name)
-    return pod
 }
@@ -86,13 +86,6 @@ func getCurrentKubeletConfig() (*componentconfig.KubeletConfiguration, error) {
     return kubeCfg, nil
 }
 
-// Convenience method to set the evictionHard threshold during the current context.
-func tempSetEvictionHard(f *framework.Framework, evictionHard string) {
-    tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) {
-        initialConfig.EvictionHard = evictionHard
-    })
-}
-
 // Must be called within a Context. Allows the function to modify the KubeletConfiguration during the BeforeEach of the context.
 // The change is reverted in the AfterEach of the context.
 // Returns true on success.