Merge pull request #43590 from dashpole/eviction_complete_deletion

Automatic merge from submit-queue (batch tested with PRs 46022, 46055, 45308, 46209, 43590)

Eviction does not evict unless the previous pod has been cleaned up

Addresses #43166
This PR makes two main changes:
First, it makes the eviction loop re-trigger immediately if there may still be resource pressure. This way, if we have already spent 10 seconds deleting a pod, we don't wait another 10 seconds before the next synchronize call.
Second, it waits for the evicted pod to be cleaned up (including volumes, cgroups, etc.) before moving on to the next synchronize call. That wait is bounded by a timeout, currently set to 30 seconds.
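As a rough, self-contained sketch of the new loop behavior (illustrative only; runEvictionLoop, synchronize, and waitForCleanup are placeholder names, while the actual change, shown in the first file diff below, has synchronize return the evicted *v1.Pod and polls a PodCleanedUpFunc on the manager's clock):

package sketch

import (
	"time"

	"k8s.io/kubernetes/pkg/api/v1"
)

// runEvictionLoop illustrates the new behavior: re-run synchronize immediately
// after an eviction (once the pod is cleaned up), otherwise wait the normal
// monitoring interval.
func runEvictionLoop(synchronize func() *v1.Pod, waitForCleanup func(*v1.Pod), monitoringInterval time.Duration) {
	for {
		if evicted := synchronize(); evicted != nil {
			// A pod was evicted, so the node may still be under pressure:
			// wait (bounded by a timeout) for the pod's resources to be
			// reclaimed, then synchronize again right away.
			waitForCleanup(evicted)
			continue
		}
		// Nothing was evicted; sleep for the normal monitoring interval.
		time.Sleep(monitoringInterval)
	}
}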
Kubernetes Submit Queue 2017-05-22 20:00:03 -07:00 committed by GitHub
commit 99a8f7c303
7 changed files with 76 additions and 48 deletions


@@ -43,6 +43,11 @@ import (
"k8s.io/kubernetes/pkg/kubelet/util/format"
)
const (
podCleanupTimeout = 30 * time.Second
podCleanupPollFreq = time.Second
)
// managerImpl implements Manager
type managerImpl struct {
// used to track time
@@ -135,9 +140,18 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
}
// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
// start the eviction manager monitoring
go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
go func() {
for {
if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
} else {
time.Sleep(monitoringInterval)
}
}
}()
}
// IsUnderMemoryPressure returns true if the node is under memory pressure.
@@ -188,11 +202,12 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
}
// synchronize is the main control loop that enforces eviction thresholds.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
// Returns the pod that was killed, or nil if no pod was killed.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
// if we have nothing to do, just return
thresholds := m.config.Thresholds
if len(thresholds) == 0 {
return
return nil
}
glog.V(3).Infof("eviction manager: synchronize housekeeping")
@@ -203,7 +218,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// this may error if cadvisor has yet to complete housekeeping, so we will just try again in the next pass.
hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
if err != nil {
return
return nil
}
m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
@@ -213,7 +228,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
if err != nil {
glog.Errorf("eviction manager: unexpected err: %v", err)
return
return nil
}
debugLogObservations("observations", observations)
@@ -291,7 +306,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
starvedResources := getStarvedResources(thresholds)
if len(starvedResources) == 0 {
glog.V(3).Infof("eviction manager: no resources are starved")
return
return nil
}
// rank the resources to reclaim by eviction priority
@@ -308,7 +323,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
return
return nil
}
glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
@@ -317,16 +332,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
rank, ok := m.resourceToRankFunc[resourceToReclaim]
if !ok {
glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
return
return nil
}
// the only candidates viable for eviction are those pods that had anything running.
activePods := podFunc()
if len(activePods) == 0 {
glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
return
}
// rank the running pods for eviction for the specified resource
rank(activePods, statsFunc)
@@ -364,14 +374,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// this is a blocking call and should only return when the pod and its containers are killed.
err := m.killPodFunc(pod, status, &gracePeriodOverride)
if err != nil {
glog.Infof("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
continue
glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
}
// success, so we return until the next housekeeping interval
glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
return
return pod
}
glog.Infof("eviction manager: unable to evict any pods from the node")
return nil
}
func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
timeout := m.clock.NewTimer(podCleanupTimeout)
tick := m.clock.Tick(podCleanupPollFreq)
for {
select {
case <-timeout.C():
glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
return
case <-tick:
if podCleanedUpFunc(pod) {
glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
return
}
}
}
}
// reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
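For readers less familiar with the kubelet's injectable clock, the wait loop in waitForPodCleanup above is roughly equivalent to the following standard-library sketch (an illustration, not this PR's code; the real version goes through m.clock, presumably so the 30-second timeout can be driven by a fake clock in tests):

// Standard-library equivalent of waitForPodCleanup (sketch only; v1 refers to
// k8s.io/kubernetes/pkg/api/v1 as in the earlier example).
func waitForPodCleanupSketch(podCleanedUp func(*v1.Pod) bool, pod *v1.Pod) {
	timeout := time.NewTimer(30 * time.Second) // podCleanupTimeout
	defer timeout.Stop()
	ticker := time.NewTicker(time.Second) // podCleanupPollFreq
	defer ticker.Stop()
	for {
		select {
		case <-timeout.C:
			// Give up waiting; the next synchronize pass runs regardless.
			return
		case <-ticker.C:
			if podCleanedUp(pod) {
				return
			}
		}
	}
}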


@@ -53,7 +53,7 @@ type Config struct {
// Manager evaluates when an eviction threshold for node stability has been met on the node.
type Manager interface {
// Start starts the control loop to monitor eviction thresholds at specified interval.
Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
// IsUnderMemoryPressure returns true if the node is under memory pressure.
IsUnderMemoryPressure() bool
@@ -93,6 +93,9 @@ type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int
// ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state)
type ActivePodsFunc func() []*v1.Pod
// PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed.
type PodCleanedUpFunc func(*v1.Pod) bool
// statsFunc returns the usage stats if known for an input pod.
type statsFunc func(pod *v1.Pod) (statsapi.PodStats, bool)
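As a hypothetical usage example (not part of this change), any function matching PodCleanedUpFunc above can be passed as the new third argument to Start; evictionManager, diskInfoProvider, activePodsFunc, and nodeProvider below are placeholders, and the kubelet itself wires in podResourcesAreReclaimed, as the next file's hunk shows:

// Hypothetical wiring of the new parameter, e.g. in a test harness.
alwaysCleanedUp := func(pod *v1.Pod) bool { return true } // trivial PodCleanedUpFunc
evictionManager.Start(diskInfoProvider, activePodsFunc, alwaysCleanedUp, nodeProvider, 10*time.Second)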


@@ -1156,7 +1156,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
glog.Fatalf("Failed to start cAdvisor %v", err)
}
// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
kl.evictionManager.Start(kl, kl.GetActivePods, kl.podResourcesAreReclaimed, kl, evictionMonitoringPeriod)
}
// Run starts the kubelet reacting to config updates


@@ -733,15 +733,10 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
}
// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
// API server.
func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
if pod.DeletionTimestamp == nil {
// We shouldn't delete pods whose DeletionTimestamp is not set
return false
}
if !notRunning(pod.Status.ContainerStatuses) {
// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
if !notRunning(status.ContainerStatuses) {
// We shouldn't delete pods that still have running containers
glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
return false
@@ -761,6 +756,15 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
return true
}
// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
status, ok := kl.statusManager.GetPodStatus(pod.UID)
if !ok {
status = pod.Status
}
return kl.PodResourcesAreReclaimed(pod, status)
}
// notRunning returns true if every status is terminated or waiting, or the status list
// is empty.
func notRunning(statuses []v1.ContainerStatus) bool {


@@ -81,7 +81,7 @@ type PodStatusProvider interface {
// An object which provides guarantees that a pod can be safely deleted.
type PodDeletionSafetyProvider interface {
// A function which returns true if the pod can safely be deleted
OkToDeletePod(pod *v1.Pod) bool
PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool
}
// Manager is the Source of truth for kubelet pod status, and should be kept up-to-date with
@@ -454,7 +454,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
m.apiStatusVersions[pod.UID] = status.version
// We don't handle graceful deletion of mirror pods.
if !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod) {
if m.canBeDeleted(pod, status.status) {
deleteOptions := metav1.NewDeleteOptions(0)
// Use the pod UID as the precondition for deletion to prevent deleting a newly created pod with the same name and namespace.
deleteOptions.Preconditions = metav1.NewUIDPreconditions(string(pod.UID))
@@ -472,16 +472,18 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
// This method is not thread safe, and must only be accessed by the sync thread.
func (m *manager) needsUpdate(uid types.UID, status versionedPodStatus) bool {
latest, ok := m.apiStatusVersions[uid]
return !ok || latest < status.version || m.couldBeDeleted(uid, status.status)
if !ok || latest < status.version {
return true
}
func (m *manager) couldBeDeleted(uid types.UID, status v1.PodStatus) bool {
// The pod could be a static pod, so we should translate first.
pod, ok := m.podManager.GetPodByUID(uid)
if !ok {
return false
}
return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod)
return m.canBeDeleted(pod, status.status)
}
func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.PodResourcesAreReclaimed(pod, status) && pod.DeletionTimestamp != nil
}
// needsReconcile compares the given status with the status in the pod manager (which


@@ -11,10 +11,7 @@ go_library(
name = "go_default_library",
srcs = ["fake_pod_deletion_safety.go"],
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/pod:go_default_library",
],
deps = ["//pkg/api/v1:go_default_library"],
)
filegroup(


@@ -16,13 +16,10 @@ limitations under the License.
package testing
import (
"k8s.io/kubernetes/pkg/api/v1"
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
)
import "k8s.io/kubernetes/pkg/api/v1"
type FakePodDeletionSafetyProvider struct{}
func (f *FakePodDeletionSafetyProvider) OkToDeletePod(pod *v1.Pod) bool {
return !kubepod.IsMirrorPod(pod) && pod.DeletionTimestamp != nil
func (f *FakePodDeletionSafetyProvider) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
return true
}