Merge pull request #43590 from dashpole/eviction_complete_deletion
Automatic merge from submit-queue (batch tested with PRs 46022, 46055, 45308, 46209, 43590)

Eviction does not evict unless the previous pod has been cleaned up

Addresses #43166

This PR makes two main changes. First, it makes the eviction loop re-trigger immediately if there may still be pressure; this way, if we have already waited 10 seconds to delete a pod, we don't need to wait another 10 seconds for the next synchronize call. Second, it waits for the evicted pod to be cleaned up (including volumes, cgroups, etc.) before moving on to the next synchronize call, with a timeout for this wait currently set to 30 seconds.
Commit 99a8f7c303
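A rough, self-contained sketch of the control flow described above (illustrative only; the kubelet's real implementation follows in the diff below, and names such as `synchronizeOnce`, `podCleanedUp`, and the durations here are made up):

```go
package main

import (
	"fmt"
	"time"
)

// evictionLoop mimics the new Start loop: synchronizeOnce stands in for the
// eviction manager's synchronize pass and returns the name of an evicted pod,
// or "" when nothing was evicted; podCleanedUp stands in for the kubelet's
// "are all of this pod's resources reclaimed?" check.
func evictionLoop(synchronizeOnce func() string, podCleanedUp func(string) bool,
	monitoringInterval, cleanupTimeout, pollFreq time.Duration) {
	for {
		evicted := synchronizeOnce()
		if evicted == "" {
			// No pressure (or nothing evictable): wait a full interval, as before.
			time.Sleep(monitoringInterval)
			continue
		}
		// A pod was evicted: wait (bounded by cleanupTimeout) for its cleanup,
		// then loop immediately so remaining pressure is handled without an
		// extra interval of delay.
		deadline := time.Now().Add(cleanupTimeout)
		for !podCleanedUp(evicted) && time.Now().Before(deadline) {
			time.Sleep(pollFreq)
		}
	}
}

func main() {
	evictions := []string{"pod-a", "pod-b", ""} // two evictions, then no pressure
	i := 0
	synchronize := func() string {
		p := evictions[i%len(evictions)]
		i++
		fmt.Println("synchronize ->", p)
		return p
	}
	go evictionLoop(
		synchronize,
		func(string) bool { return true }, // pretend cleanup finishes immediately
		200*time.Millisecond, time.Second, 50*time.Millisecond,
	)
	time.Sleep(time.Second)
}
```

The key behavioural change is that an eviction no longer "uses up" a monitoring interval: the next pass starts as soon as the evicted pod's cleanup completes or times out.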
@@ -43,6 +43,11 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 )
 
+const (
+	podCleanupTimeout = 30 * time.Second
+	podCleanupPollFreq = time.Second
+)
+
 // managerImpl implements Manager
 type managerImpl struct {
 	// used to track time
@@ -135,9 +140,18 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 }
 
 // Start starts the control loop to observe and response to low compute resources.
-func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
+func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
 	// start the eviction manager monitoring
-	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
+	go func() {
+		for {
+			if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
+				glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
+				m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
+			} else {
+				time.Sleep(monitoringInterval)
+			}
+		}
+	}()
 }
 
 // IsUnderMemoryPressure returns true if the node is under memory pressure.
@@ -188,11 +202,12 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
 }
 
 // synchronize is the main control loop that enforces eviction thresholds.
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
+// Returns the pod that was killed, or nil if no pod was killed.
+func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
 	// if we have nothing to do, just return
 	thresholds := m.config.Thresholds
 	if len(thresholds) == 0 {
-		return
+		return nil
 	}
 
 	glog.V(3).Infof("eviction manager: synchronize housekeeping")
@@ -203,7 +218,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass.
 	hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
 	if err != nil {
-		return
+		return nil
 	}
 	m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
 	m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
@@ -213,7 +228,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
 	if err != nil {
 		glog.Errorf("eviction manager: unexpected err: %v", err)
-		return
+		return nil
 	}
 	debugLogObservations("observations", observations)
 
@@ -291,7 +306,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	starvedResources := getStarvedResources(thresholds)
 	if len(starvedResources) == 0 {
 		glog.V(3).Infof("eviction manager: no resources are starved")
-		return
+		return nil
 	}
 
 	// rank the resources to reclaim by eviction priority
@@ -308,7 +323,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
 	if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
 		glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
-		return
+		return nil
 	}
 
 	glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
@@ -317,16 +332,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	rank, ok := m.resourceToRankFunc[resourceToReclaim]
 	if !ok {
 		glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
-		return
+		return nil
 	}
 
 	// the only candidates viable for eviction are those pods that had anything running.
 	activePods := podFunc()
-	if len(activePods) == 0 {
-		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
-		return
-	}
-
 	// rank the running pods for eviction for the specified resource
 	rank(activePods, statsFunc)
 
@@ -364,14 +374,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		// this is a blocking call and should only return when the pod and its containers are killed.
 		err := m.killPodFunc(pod, status, &gracePeriodOverride)
 		if err != nil {
-			glog.Infof("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
-			continue
+			glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
 		}
 		// success, so we return until the next housekeeping interval
 		glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
-		return
+		return pod
 	}
+	glog.Infof("eviction manager: unable to evict any pods from the node")
+	return nil
 }
 
+func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
+	timeout := m.clock.NewTimer(podCleanupTimeout)
+	tick := m.clock.Tick(podCleanupPollFreq)
+	for {
+		select {
+		case <-timeout.C():
+			glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
+			return
+		case <-tick:
+			if podCleanedUpFunc(pod) {
+				glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
+				return
+			}
+		}
+	}
+}
+
 // reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
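The wait above races a 30-second timer against a 1-second tick on the manager's clock (m.clock), which presumably allows a fake clock in unit tests. The same select-on-timer-versus-ticker shape, reduced to a standalone standard-library sketch (the helper name and durations here are illustrative, not kubelet API):

```go
package main

import (
	"fmt"
	"time"
)

// pollUntil re-checks done every pollFreq until it reports true or the timeout
// elapses, mirroring the shape of waitForPodCleanup above but with the
// standard library clock instead of an injectable one.
func pollUntil(done func() bool, timeout, pollFreq time.Duration) bool {
	deadline := time.NewTimer(timeout)
	defer deadline.Stop()
	ticker := time.NewTicker(pollFreq)
	defer ticker.Stop()
	for {
		select {
		case <-deadline.C:
			return false // timed out waiting for cleanup
		case <-ticker.C:
			if done() {
				return true // cleaned up before the deadline
			}
		}
	}
}

func main() {
	start := time.Now()
	cleaned := pollUntil(func() bool { return time.Since(start) > 2*time.Second }, 5*time.Second, 500*time.Millisecond)
	fmt.Println("cleaned up:", cleaned)
}
```

Either way, the timer guarantees the wait ends after the timeout even if the pod never reports as cleaned up, so a stuck cleanup cannot stall eviction forever.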
@@ -53,7 +53,7 @@ type Config struct {
 // Manager evaluates when an eviction threshold for node stability has been met on the node.
 type Manager interface {
 	// Start starts the control loop to monitor eviction thresholds at specified interval.
-	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
+	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
 
 	// IsUnderMemoryPressure returns true if the node is under memory pressure.
 	IsUnderMemoryPressure() bool
@@ -93,6 +93,9 @@ type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int
 // ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state)
 type ActivePodsFunc func() []*v1.Pod
 
+// PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed.
+type PodCleanedUpFunc func(*v1.Pod) bool
+
 // statsFunc returns the usage stats if known for an input pod.
 type statsFunc func(pod *v1.Pod) (statsapi.PodStats, bool)
 
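PodCleanedUpFunc is a plain callback, so any func(*v1.Pod) bool satisfies it. As a purely hypothetical example (not part of this PR), a test stub that only reports a pod as cleaned up after a few polls, which would exercise the retry path in waitForPodCleanup (the import path follows the BUILD dependency shown later in this diff):

```go
package eviction_test

import (
	"sync"

	"k8s.io/kubernetes/pkg/api/v1"
)

// cleanupStub is a hypothetical helper: it reports a pod as cleaned up only
// after podCleanedUp has been called delayPolls times for that pod.
type cleanupStub struct {
	mu        sync.Mutex
	remaining map[string]int // polls left per pod, keyed by pod name
}

func newCleanupStub(delayPolls int, podNames ...string) *cleanupStub {
	s := &cleanupStub{remaining: map[string]int{}}
	for _, name := range podNames {
		s.remaining[name] = delayPolls
	}
	return s
}

// podCleanedUp has the PodCleanedUpFunc signature and could be passed to Start.
func (s *cleanupStub) podCleanedUp(pod *v1.Pod) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.remaining[pod.Name] > 0 {
		s.remaining[pod.Name]--
		return false
	}
	return true
}
```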
@@ -1156,7 +1156,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
 		glog.Fatalf("Failed to start cAdvisor %v", err)
 	}
 	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
-	kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
+	kl.evictionManager.Start(kl, kl.GetActivePods, kl.podResourcesAreReclaimed, kl, evictionMonitoringPeriod)
 }
 
 // Run starts the kubelet reacting to config updates
@@ -733,15 +733,10 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
 	return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
 }
 
-// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
-// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
-// API server.
-func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
-	if pod.DeletionTimestamp == nil {
-		// We shouldnt delete pods whose DeletionTimestamp is not set
-		return false
-	}
-	if !notRunning(pod.Status.ContainerStatuses) {
+// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
+// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
+func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	if !notRunning(status.ContainerStatuses) {
 		// We shouldnt delete pods that still have running containers
 		glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
 		return false
@@ -761,6 +756,15 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 	return true
 }
 
+// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
+func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
+	status, ok := kl.statusManager.GetPodStatus(pod.UID)
+	if !ok {
+		status = pod.Status
+	}
+	return kl.PodResourcesAreReclaimed(pod, status)
+}
+
 // notRunning returns true if every status is terminated or waiting, or the status list
 // is empty.
 func notRunning(statuses []v1.ContainerStatus) bool {
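The unexported podResourcesAreReclaimed wrapper above is an adapter: it turns the two-argument exported check into the one-argument PodCleanedUpFunc shape that evictionManager.Start receives earlier in this diff, falling back to pod.Status when the status manager has nothing cached. The same pattern as a standalone sketch with made-up types (nothing here is kubelet API):

```go
package main

import "fmt"

// checkFunc mirrors the shape of PodCleanedUpFunc: a one-argument callback type.
type checkFunc func(name string) bool

// tracker is a stand-in for the kubelet: its exported check needs both the
// object and an externally supplied status.
type tracker struct {
	cache map[string]bool // hypothetical "latest known status" cache
}

// Reclaimed is the two-argument exported check (by analogy with PodResourcesAreReclaimed).
func (t *tracker) Reclaimed(name string, reclaimed bool) bool {
	return reclaimed
}

// reclaimed adapts Reclaimed to the one-argument callback shape by filling in
// the freshest cached value, with a conservative fallback when none is cached
// (by analogy with the unexported podResourcesAreReclaimed wrapper).
func (t *tracker) reclaimed(name string) bool {
	cached, ok := t.cache[name]
	if !ok {
		cached = false
	}
	return t.Reclaimed(name, cached)
}

// start accepts the callback, as the eviction manager's Start accepts a PodCleanedUpFunc.
func start(check checkFunc) {
	fmt.Println("pod-a cleaned up:", check("pod-a"))
	fmt.Println("pod-b cleaned up:", check("pod-b"))
}

func main() {
	t := &tracker{cache: map[string]bool{"pod-a": true}}
	// A method value satisfies the function type, just as kl.podResourcesAreReclaimed
	// is passed to evictionManager.Start in this PR.
	start(t.reclaimed)
}
```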
@@ -81,7 +81,7 @@ type PodStatusProvider interface {
 // An object which provides guarantees that a pod can be saftely deleted.
 type PodDeletionSafetyProvider interface {
 	// A function which returns true if the pod can safely be deleted
-	OkToDeletePod(pod *v1.Pod) bool
+	PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool
 }
 
 // Manager is the Source of truth for kubelet pod status, and should be kept up-to-date with
@@ -454,7 +454,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 	m.apiStatusVersions[pod.UID] = status.version
 
 	// We don't handle graceful deletion of mirror pods.
-	if !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod) {
+	if m.canBeDeleted(pod, status.status) {
 		deleteOptions := metav1.NewDeleteOptions(0)
 		// Use the pod UID as the precondition for deletion to prevent deleting a newly created pod with the same name and namespace.
 		deleteOptions.Preconditions = metav1.NewUIDPreconditions(string(pod.UID))
@@ -472,16 +472,18 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 // This method is not thread safe, and most only be accessed by the sync thread.
 func (m *manager) needsUpdate(uid types.UID, status versionedPodStatus) bool {
 	latest, ok := m.apiStatusVersions[uid]
-	return !ok || latest < status.version || m.couldBeDeleted(uid, status.status)
-}
-
-func (m *manager) couldBeDeleted(uid types.UID, status v1.PodStatus) bool {
-	// The pod could be a static pod, so we should translate first.
+	if !ok || latest < status.version {
+		return true
+	}
 	pod, ok := m.podManager.GetPodByUID(uid)
 	if !ok {
 		return false
 	}
-	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod)
+	return m.canBeDeleted(pod, status.status)
+}
+
+func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
+	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.PodResourcesAreReclaimed(pod, status) && pod.DeletionTimestamp != nil
 }
 
 // needsReconcile compares the given status with the status in the pod manager (which
@@ -11,10 +11,7 @@ go_library(
     name = "go_default_library",
     srcs = ["fake_pod_deletion_safety.go"],
    tags = ["automanaged"],
-    deps = [
-        "//pkg/api/v1:go_default_library",
-        "//pkg/kubelet/pod:go_default_library",
-    ],
+    deps = ["//pkg/api/v1:go_default_library"],
 )
 
 filegroup(
@@ -16,13 +16,10 @@ limitations under the License.
 
 package testing
 
-import (
-	"k8s.io/kubernetes/pkg/api/v1"
-	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
-)
+import "k8s.io/kubernetes/pkg/api/v1"
 
 type FakePodDeletionSafetyProvider struct{}
 
-func (f *FakePodDeletionSafetyProvider) OkToDeletePod(pod *v1.Pod) bool {
-	return !kubepod.IsMirrorPod(pod) && pod.DeletionTimestamp != nil
+func (f *FakePodDeletionSafetyProvider) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	return true
 }