Merge pull request #42204 from dashpole/allocatable_eviction

Automatic merge from submit-queue

Eviction Manager Enforces Allocatable Thresholds

This PR modifies the eviction manager to enforce node allocatable thresholds for memory as described in kubernetes/community#348.
This PR should be merged after #41234. 
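For reference, the node e2e test added in this PR drives the new behavior through dynamic kubelet config. The sketch below mirrors that configuration (the helper name is illustrative; the fields and values are copied from the test added here):

```go
package e2e_node

import (
	"k8s.io/kubernetes/pkg/apis/componentconfig"
	"k8s.io/kubernetes/pkg/kubelet/cm"
)

// enableAllocatableEnforcement (illustrative name) reserves memory for system and kube
// daemons and tells the kubelet to enforce the resulting Allocatable boundary on the
// pods cgroup, so the allocatable memory threshold is crossed long before the hard
// eviction threshold.
func enableAllocatableEnforcement(initialConfig *componentconfig.KubeletConfiguration) {
	initialConfig.EvictionHard = "memory.available<10%" // hard threshold kept only as a backstop
	initialConfig.SystemReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
	initialConfig.KubeReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
	initialConfig.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey}
	initialConfig.ExperimentalNodeAllocatableIgnoreEvictionThreshold = false
	initialConfig.CgroupsPerQOS = true
}
```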

cc @kubernetes/sig-node-pr-reviews @kubernetes/sig-node-feature-requests @vishh 

**Why is this a bug/regression?**

Kubelet uses `oom_score_adj` to enforce QoS policies, but `oom_score_adj` is based on the overall memory a pod requests. A Burstable pod that requests a lot of memory can therefore cause OOM kills of Guaranteed pods, which violates QoS. Even worse, we have observed system daemons such as the kubelet and kube-proxy being killed by the OOM killer.
Without this PR, v1.6 will have node stability issues and a regression in the existing GA feature, `out of resource` handling.
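For context, the kubelet derives `oom_score_adj` from a pod's QoS class and memory request, not from its live usage. A minimal sketch of that mapping follows (constants and clamping approximate `pkg/kubelet/qos` for this release; the function name and demo values are illustrative, not the actual implementation):

```go
package main

import "fmt"

// Approximate QoS score adjustments (treat as illustrative, not authoritative).
const (
	guaranteedOOMScoreAdj = -998
	besteffortOOMScoreAdj = 1000
)

// oomScoreAdj sketches how a container's oom_score_adj is derived from its QoS class
// and its memory request relative to node capacity.
func oomScoreAdj(qosClass string, memoryRequestBytes, memoryCapacityBytes int64) int {
	switch qosClass {
	case "Guaranteed":
		return guaranteedOOMScoreAdj
	case "BestEffort":
		return besteffortOOMScoreAdj
	}
	// Burstable: the larger the request relative to node capacity, the lower (more
	// protected) the score, independent of how much memory the pod actually uses.
	adj := 1000 - (1000*memoryRequestBytes)/memoryCapacityBytes
	if adj < 1000+guaranteedOOMScoreAdj { // never reach the Guaranteed band
		adj = 1000 + guaranteedOOMScoreAdj
	}
	if adj >= besteffortOOMScoreAdj { // never look like BestEffort
		adj = besteffortOOMScoreAdj - 1
	}
	return int(adj)
}

func main() {
	capacity := int64(16 << 30) // a 16Gi node
	// A Burstable container requesting 14Gi ends up almost as protected as Guaranteed,
	// even if it is the one consuming all the memory.
	fmt.Println(oomScoreAdj("Burstable", 14<<30, capacity)) // 125
	fmt.Println(oomScoreAdj("Guaranteed", 0, capacity))     // -998
}
```

Because the kernel OOM killer combines these adjustments with live usage at the moment the node runs out of memory, the victim it picks is not guaranteed to respect QoS ordering; enforcing the allocatable threshold in the eviction manager lets the kubelet evict pods before the system ever reaches OOM.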
Merged by Kubernetes Submit Queue (committed via GitHub) on 2017-03-03 20:20:12 -08:00.
14 changed files with 474 additions and 83 deletions


@@ -57,6 +57,7 @@ go_library(
go_test(
name = "go_default_test",
srcs = [
"allocatable_eviction_test.go",
"apparmor_test.go",
"container_manager_test.go",
"critical_pod_test.go",


@@ -0,0 +1,104 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e_node
import (
"fmt"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/apis/componentconfig"
"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
// Eviction Policy is described here:
// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md
var _ = framework.KubeDescribe("AllocatableEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
f := framework.NewDefaultFramework("allocatable-eviction-test")
podTestSpecs := []podTestSpec{
{
evictionPriority: 1, // This pod should be evicted before the innocent pod
pod: *getMemhogPod("memory-hog-pod", "memory-hog", v1.ResourceRequirements{}),
},
{
evictionPriority: 0, // This pod should never be evicted
pod: v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "innocent-pod"},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: "normal-memory-usage-container",
Command: []string{
"sh",
"-c", //make one big (5 Gb) file
"dd if=/dev/urandom of=largefile bs=5000000000 count=1; while true; do sleep 5; done",
},
},
},
},
},
},
}
evictionTestTimeout := 40 * time.Minute
testCondition := "Memory Pressure"
kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
initialConfig.EvictionHard = "memory.available<10%"
// Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds.
initialConfig.SystemReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
initialConfig.KubeReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
initialConfig.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey}
initialConfig.ExperimentalNodeAllocatableIgnoreEvictionThreshold = false
initialConfig.CgroupsPerQOS = true
}
runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasMemoryPressure, kubeletConfigUpdate)
})
// Returns TRUE if the node has Memory Pressure, FALSE otherwise
func hasMemoryPressure(f *framework.Framework, testCondition string) (bool, error) {
localNodeStatus := getLocalNode(f).Status
_, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeMemoryPressure)
Expect(pressure).NotTo(BeNil())
hasPressure := pressure.Status == v1.ConditionTrue
By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))
// Additional Logging relating to Memory
summary, err := getNodeSummary()
if err != nil {
return false, err
}
if summary.Node.Memory != nil && summary.Node.Memory.WorkingSetBytes != nil && summary.Node.Memory.AvailableBytes != nil {
framework.Logf("Node.Memory.WorkingSetBytes: %d, summary.Node.Memory.AvailableBytes: %d", *summary.Node.Memory.WorkingSetBytes, *summary.Node.Memory.AvailableBytes)
}
for _, pod := range summary.Pods {
framework.Logf("Pod: %s", pod.PodRef.Name)
for _, container := range pod.Containers {
if container.Memory != nil && container.Memory.WorkingSetBytes != nil {
framework.Logf("--- summary Container: %s WorkingSetBytes: %d", container.Name, *container.Memory.WorkingSetBytes)
}
}
}
return hasPressure, nil
}


@@ -22,6 +22,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/apis/componentconfig"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
@@ -112,10 +113,11 @@ var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive] [Flak
}
evictionTestTimeout := 30 * time.Minute
testCondition := "Disk Pressure due to Inodes"
// Set the EvictionHard threshold lower to decrease test time
- evictionHardLimit := "nodefs.inodesFree<50%"
+ kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
+ initialConfig.EvictionHard = "nodefs.inodesFree<50%"
+ }
- runEvictionTest(f, testCondition, podTestSpecs, evictionHardLimit, evictionTestTimeout, hasInodePressure)
+ runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure, kubeletConfigUpdate)
})
// Struct used by runEvictionTest that specifies the pod, and when that pod should be evicted, relative to other pods
@@ -133,12 +135,12 @@ type podTestSpec struct {
// It ensures that lower evictionPriority pods are always evicted before higher evictionPriority pods (2 evicted before 1, etc.)
// It ensures that all lower evictionPriority pods are eventually evicted.
// runEvictionTest then cleans up the testing environment by deleting provided pods, and ensures that testCondition no longer exists
- func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionHard string,
- evictionTestTimeout time.Duration, hasPressureCondition func(*framework.Framework, string) (bool, error)) {
+ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionTestTimeout time.Duration,
+ hasPressureCondition func(*framework.Framework, string) (bool, error), updateFunction func(initialConfig *componentconfig.KubeletConfiguration)) {
Context(fmt.Sprintf("when we run containers that should cause %s", testCondition), func() {
- tempSetEvictionHard(f, evictionHard)
+ tempSetCurrentKubeletConfig(f, updateFunction)
BeforeEach(func() {
By("seting up pods to be used by tests")
for _, spec := range podTestSpecs {
@@ -148,6 +150,11 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs
})
It(fmt.Sprintf("should eventually see %s, and then evict all of the correct pods", testCondition), func() {
+ configEnabled, err := isKubeletConfigEnabled(f)
+ framework.ExpectNoError(err)
+ if !configEnabled {
+ framework.Skipf("Dynamic kubelet config must be enabled for this test to run.")
+ }
Eventually(func() error {
hasPressure, err := hasPressureCondition(f, testCondition)
if err != nil {
@@ -299,14 +306,8 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs
// Returns TRUE if the node has disk pressure due to inodes exists on the node, FALSE otherwise
func hasInodePressure(f *framework.Framework, testCondition string) (bool, error) {
- nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
- framework.ExpectNoError(err, "getting node list")
- if len(nodeList.Items) != 1 {
- return false, fmt.Errorf("expected 1 node, but see %d. List: %v", len(nodeList.Items), nodeList.Items)
- }
- _, pressure := v1.GetNodeCondition(&nodeList.Items[0].Status, v1.NodeDiskPressure)
+ localNodeStatus := getLocalNode(f).Status
+ _, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeDiskPressure)
Expect(pressure).NotTo(BeNil())
hasPressure := pressure.Status == v1.ConditionTrue
By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))


@@ -136,7 +136,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
By("creating a guaranteed pod, a burstable pod, and a besteffort pod.")
// A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
- guaranteed := createMemhogPod(f, "guaranteed-", "guaranteed", v1.ResourceRequirements{
+ guaranteed := getMemhogPod("guaranteed-pod", "guaranteed", v1.ResourceRequirements{
Requests: v1.ResourceList{
"cpu": resource.MustParse("100m"),
"memory": resource.MustParse("100Mi"),
@@ -145,16 +145,22 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
"cpu": resource.MustParse("100m"),
"memory": resource.MustParse("100Mi"),
}})
+ guaranteed = f.PodClient().CreateSync(guaranteed)
+ glog.Infof("pod created with name: %s", guaranteed.Name)
// A pod is burstable if limits and requests do not match across all containers.
- burstable := createMemhogPod(f, "burstable-", "burstable", v1.ResourceRequirements{
+ burstable := getMemhogPod("burstable-pod", "burstable", v1.ResourceRequirements{
Requests: v1.ResourceList{
"cpu": resource.MustParse("100m"),
"memory": resource.MustParse("100Mi"),
}})
+ burstable = f.PodClient().CreateSync(burstable)
+ glog.Infof("pod created with name: %s", burstable.Name)
- // A pod is besteffort if none of its containers have specified any requests or limits.
- besteffort := createMemhogPod(f, "besteffort-", "besteffort", v1.ResourceRequirements{})
+ // A pod is besteffort if none of its containers have specified any requests or limits.
+ besteffort := getMemhogPod("besteffort-pod", "besteffort", v1.ResourceRequirements{})
+ besteffort = f.PodClient().CreateSync(besteffort)
+ glog.Infof("pod created with name: %s", besteffort.Name)
// We poll until timeout or all pods are killed.
// Inside the func, we check that all pods are in a valid phase with
@@ -232,7 +238,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
})
- func createMemhogPod(f *framework.Framework, genName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
+ func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
env := []v1.EnvVar{
{
Name: "MEMORY_LIMIT",
@@ -256,9 +262,9 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
memLimit = "$(MEMORY_LIMIT)"
}
- pod := &v1.Pod{
+ return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
- GenerateName: genName,
+ Name: podName,
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
@@ -277,8 +283,4 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
},
},
}
- // The generated pod.Name will be on the pod spec returned by CreateSync
- pod = f.PodClient().CreateSync(pod)
- glog.Infof("pod created with name: %s", pod.Name)
- return pod
}


@@ -86,13 +86,6 @@ func getCurrentKubeletConfig() (*componentconfig.KubeletConfiguration, error) {
return kubeCfg, nil
}
- // Convenience method to set the evictionHard threshold during the current context.
- func tempSetEvictionHard(f *framework.Framework, evictionHard string) {
- tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) {
- initialConfig.EvictionHard = evictionHard
- })
- }
// Must be called within a Context. Allows the function to modify the KubeletConfiguration during the BeforeEach of the context.
// The change is reverted in the AfterEach of the context.
// Returns true on success.