wait for previous evicted pod to be cleaned up

David Ashpole 2017-05-16 14:23:42 -07:00
parent 682f3a39a0
commit 21fb487245
7 changed files with 76 additions and 48 deletions

View File

@@ -43,6 +43,11 @@ import (
"k8s.io/kubernetes/pkg/kubelet/util/format"
)
const (
podCleanupTimeout = 30 * time.Second
podCleanupPollFreq = time.Second
)
// managerImpl implements Manager
type managerImpl struct {
// used to track time
@@ -135,9 +140,18 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
}
// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
// start the eviction manager monitoring
go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
go func() {
for {
if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
} else {
time.Sleep(monitoringInterval)
}
}
}()
}
// IsUnderMemoryPressure returns true if the node is under memory pressure.
@@ -188,11 +202,12 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
}
// synchronize is the main control loop that enforces eviction thresholds.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
// Returns the pod that was killed, or nil if no pod was killed.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
// if we have nothing to do, just return
thresholds := m.config.Thresholds
if len(thresholds) == 0 {
return
return nil
}
glog.V(3).Infof("eviction manager: synchronize housekeeping")
@@ -203,7 +218,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// this may error if cadvisor has yet to complete housekeeping, so we will just try again in the next pass.
hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
if err != nil {
return
return nil
}
m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
@@ -213,7 +228,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
if err != nil {
glog.Errorf("eviction manager: unexpected err: %v", err)
return
return nil
}
debugLogObservations("observations", observations)
@@ -291,7 +306,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
starvedResources := getStarvedResources(thresholds)
if len(starvedResources) == 0 {
glog.V(3).Infof("eviction manager: no resources are starved")
return
return nil
}
// rank the resources to reclaim by eviction priority
@@ -308,7 +323,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
return
return nil
}
glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
@@ -317,16 +332,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
rank, ok := m.resourceToRankFunc[resourceToReclaim]
if !ok {
glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
return
return nil
}
// the only candidates viable for eviction are those pods that have something running.
activePods := podFunc()
if len(activePods) == 0 {
glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
return
}
// rank the running pods for eviction for the specified resource
rank(activePods, statsFunc)
@@ -364,14 +374,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// this is a blocking call and should only return when the pod and its containers are killed.
err := m.killPodFunc(pod, status, &gracePeriodOverride)
if err != nil {
glog.Infof("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
continue
glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
}
// success, so we return until the next housekeeping interval
glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
return
return pod
}
glog.Infof("eviction manager: unable to evict any pods from the node")
return nil
}
func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
timeout := m.clock.NewTimer(podCleanupTimeout)
tick := m.clock.Tick(podCleanupPollFreq)
for {
select {
case <-timeout.C():
glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
return
case <-tick:
if podCleanedUpFunc(pod) {
glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
return
}
}
}
}
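The new waitForPodCleanup helper is a bounded poll: after an eviction the manager gives the kubelet up to podCleanupTimeout (30 seconds) to release the pod's resources, checking once per podCleanupPollFreq (one second) before the next synchronize pass. The kubelet version goes through the manager's injected clock so it can be driven from tests; purely as an illustration of the same timer-versus-ticker select pattern, a standalone sketch using only the standard library (the waitFor name and the condition closure are made up for this example, not part of the change) might look like:

package main

import (
	"fmt"
	"time"
)

// waitFor polls condition() every pollFreq until it returns true or the
// timeout elapses, mirroring the shape of waitForPodCleanup: one timer
// channel for the deadline and one ticker channel for each poll attempt.
func waitFor(condition func() bool, timeout, pollFreq time.Duration) bool {
	deadline := time.NewTimer(timeout)
	defer deadline.Stop()
	tick := time.NewTicker(pollFreq)
	defer tick.Stop()
	for {
		select {
		case <-deadline.C:
			return false // timed out waiting for cleanup
		case <-tick.C:
			if condition() {
				return true // condition satisfied before the deadline
			}
		}
	}
}

func main() {
	start := time.Now()
	cleaned := waitFor(func() bool { return time.Since(start) > 3*time.Second },
		30*time.Second, time.Second)
	fmt.Println("cleaned up:", cleaned)
}

Returning a bool keeps the sketch self-contained; the manager itself only logs and returns on timeout, since a slow cleanup is not fatal and the next synchronize pass proceeds either way.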
// reclaimNodeLevelResources attempts to reclaim node-level resources. Returns true if thresholds were satisfied and no pod eviction is required.

View File

@@ -53,7 +53,7 @@ type Config struct {
// Manager evaluates when an eviction threshold for node stability has been met on the node.
type Manager interface {
// Start starts the control loop to monitor eviction thresholds at specified interval.
Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
// IsUnderMemoryPressure returns true if the node is under memory pressure.
IsUnderMemoryPressure() bool
@@ -93,6 +93,9 @@ type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int
// ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state)
type ActivePodsFunc func() []*v1.Pod
// PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed.
type PodCleanedUpFunc func(*v1.Pod) bool
// statsFunc returns the usage stats if known for an input pod.
type statsFunc func(pod *v1.Pod) (statsapi.PodStats, bool)
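The new PodCleanedUpFunc is just a predicate over a pod, so any func(*v1.Pod) bool can be handed to Start; the kubelet supplies podResourcesAreReclaimed (see the initializeRuntimeDependentModules hunk below), and tests can pass a trivial closure. A hypothetical example, not part of this change (the stillRunning map and the UID are invented for illustration; the import paths match those used elsewhere in this diff):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/kubernetes/pkg/api/v1"
)

func main() {
	// Pretend bookkeeping: pods whose resources have not been reclaimed yet.
	stillRunning := map[types.UID]bool{"uid-1": true}

	// Satisfies the PodCleanedUpFunc signature: true once nothing is left to reclaim.
	podCleanedUp := func(pod *v1.Pod) bool {
		return !stillRunning[pod.UID]
	}

	p := &v1.Pod{}
	p.UID = "uid-1"
	fmt.Println(podCleanedUp(p)) // false: resources still held

	delete(stillRunning, "uid-1")
	fmt.Println(podCleanedUp(p)) // true: safe for the eviction manager to continue
}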

View File

@@ -1156,7 +1156,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
glog.Fatalf("Failed to start cAdvisor %v", err)
}
// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
kl.evictionManager.Start(kl, kl.GetActivePods, kl.podResourcesAreReclaimed, kl, evictionMonitoringPeriod)
}
// Run starts the kubelet reacting to config updates

View File

@@ -733,15 +733,10 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
}
// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
// API server.
func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
if pod.DeletionTimestamp == nil {
// We shouldn't delete pods whose DeletionTimestamp is not set
return false
}
if !notRunning(pod.Status.ContainerStatuses) {
// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
if !notRunning(status.ContainerStatuses) {
// We shouldn't delete pods that still have running containers
glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
return false
@@ -761,6 +756,15 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
return true
}
// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
status, ok := kl.statusManager.GetPodStatus(pod.UID)
if !ok {
status = pod.Status
}
return kl.PodResourcesAreReclaimed(pod, status)
}
// notRunning returns true if every status is terminated or waiting, or the status list
// is empty.
func notRunning(statuses []v1.ContainerStatus) bool {

View File

@@ -81,7 +81,7 @@ type PodStatusProvider interface {
// An object which provides guarantees that a pod can be safely deleted.
type PodDeletionSafetyProvider interface {
// A function which returns true if the pod can safely be deleted
OkToDeletePod(pod *v1.Pod) bool
PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool
}
// Manager is the Source of truth for kubelet pod status, and should be kept up-to-date with
@@ -454,7 +454,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
m.apiStatusVersions[pod.UID] = status.version
// We don't handle graceful deletion of mirror pods.
if !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod) {
if m.canBeDeleted(pod, status.status) {
deleteOptions := metav1.NewDeleteOptions(0)
// Use the pod UID as the precondition for deletion to prevent deleting a newly created pod with the same name and namespace.
deleteOptions.Preconditions = metav1.NewUIDPreconditions(string(pod.UID))
@@ -472,16 +472,18 @@
// This method is not thread safe, and must only be accessed by the sync thread.
func (m *manager) needsUpdate(uid types.UID, status versionedPodStatus) bool {
latest, ok := m.apiStatusVersions[uid]
return !ok || latest < status.version || m.couldBeDeleted(uid, status.status)
}
func (m *manager) couldBeDeleted(uid types.UID, status v1.PodStatus) bool {
// The pod could be a static pod, so we should translate first.
if !ok || latest < status.version {
return true
}
pod, ok := m.podManager.GetPodByUID(uid)
if !ok {
return false
}
return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod)
return m.canBeDeleted(pod, status.status)
}
func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.PodResourcesAreReclaimed(pod, status) && pod.DeletionTimestamp != nil
}
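canBeDeleted now gates the API-server delete on three independent conditions: the pod is not a mirror pod, its node-level resources have been reclaimed, and its DeletionTimestamp is set. A standalone illustration of how those conditions combine, with plain booleans standing in for the real IsMirrorPod, PodResourcesAreReclaimed and DeletionTimestamp checks (none of this code is part of the change):

package main

import "fmt"

// canBeDeleted mirrors the status manager's predicate with booleans in
// place of the real checks.
func canBeDeleted(isMirror, reclaimed, hasDeletionTimestamp bool) bool {
	return !isMirror && reclaimed && hasDeletionTimestamp
}

func main() {
	cases := []struct {
		name                                      string
		isMirror, reclaimed, hasDeletionTimestamp bool
	}{
		{"mirror pod", true, true, true},                     // never deleted via this path
		{"resources not yet reclaimed", false, false, true},  // keep waiting for cleanup
		{"no deletion timestamp", false, true, false},        // pod is not being deleted
		{"deletable", false, true, true},
	}
	for _, c := range cases {
		fmt.Printf("%-28s -> %v\n", c.name, canBeDeleted(c.isMirror, c.reclaimed, c.hasDeletionTimestamp))
	}
}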
// needsReconcile compares the given status with the status in the pod manager (which

View File

@@ -11,10 +11,7 @@ go_library(
name = "go_default_library",
srcs = ["fake_pod_deletion_safety.go"],
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/pod:go_default_library",
],
deps = ["//pkg/api/v1:go_default_library"],
)
filegroup(

View File

@@ -16,13 +16,10 @@ limitations under the License.
package testing
import (
"k8s.io/kubernetes/pkg/api/v1"
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
)
import "k8s.io/kubernetes/pkg/api/v1"
type FakePodDeletionSafetyProvider struct{}
func (f *FakePodDeletionSafetyProvider) OkToDeletePod(pod *v1.Pod) bool {
return !kubepod.IsMirrorPod(pod) && pod.DeletionTimestamp != nil
func (f *FakePodDeletionSafetyProvider) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
return true
}