diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index 6cdb87a5630..c75081e78ba 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -43,6 +43,11 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 )
 
+const (
+	podCleanupTimeout  = 30 * time.Second
+	podCleanupPollFreq = time.Second
+)
+
 // managerImpl implements Manager
 type managerImpl struct {
 	// used to track time
@@ -135,9 +140,18 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 }
 
 // Start starts the control loop to observe and response to low compute resources.
-func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
+func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
 	// start the eviction manager monitoring
-	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
+	go func() {
+		for {
+			if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
+				glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
+				m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
+			} else {
+				time.Sleep(monitoringInterval)
+			}
+		}
+	}()
 }
 
 // IsUnderMemoryPressure returns true if the node is under memory pressure.
@@ -188,11 +202,12 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
 }
 
 // synchronize is the main control loop that enforces eviction thresholds.
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
+// Returns the pod that was killed, or nil if no pod was killed.
+func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
 	// if we have nothing to do, just return
 	thresholds := m.config.Thresholds
 	if len(thresholds) == 0 {
-		return
+		return nil
 	}
 
 	glog.V(3).Infof("eviction manager: synchronize housekeeping")
@@ -203,7 +218,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		// this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass.
 		hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
 		if err != nil {
-			return
+			return nil
 		}
 		m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
 		m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
@@ -213,7 +228,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
 	if err != nil {
 		glog.Errorf("eviction manager: unexpected err: %v", err)
-		return
+		return nil
 	}
 	debugLogObservations("observations", observations)
 
@@ -291,7 +306,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	starvedResources := getStarvedResources(thresholds)
 	if len(starvedResources) == 0 {
 		glog.V(3).Infof("eviction manager: no resources are starved")
-		return
+		return nil
 	}
 
 	// rank the resources to reclaim by eviction priority
@@ -308,7 +323,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
 	if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
 		glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
-		return
+		return nil
 	}
 
 	glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
@@ -317,16 +332,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	rank, ok := m.resourceToRankFunc[resourceToReclaim]
 	if !ok {
 		glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
-		return
+		return nil
 	}
 
 	// the only candidates viable for eviction are those pods that had anything running.
 	activePods := podFunc()
-	if len(activePods) == 0 {
-		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
-		return
-	}
-
 	// rank the running pods for eviction for the specified resource
 	rank(activePods, statsFunc)
 
@@ -364,14 +374,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		// this is a blocking call and should only return when the pod and its containers are killed.
 		err := m.killPodFunc(pod, status, &gracePeriodOverride)
 		if err != nil {
-			glog.Infof("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
-			continue
+			glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
 		}
-		// success, so we return until the next housekeeping interval
-		glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
-		return
+		return pod
 	}
 	glog.Infof("eviction manager: unable to evict any pods from the node")
+	return nil
+}
+
+func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
+	timeout := m.clock.NewTimer(podCleanupTimeout)
+	tick := m.clock.Tick(podCleanupPollFreq)
+	for {
+		select {
+		case <-timeout.C():
+			glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
+			return
+		case <-tick:
+			if podCleanedUpFunc(pod) {
+				glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
+				return
+			}
+		}
+	}
 }
 
 // reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go
index fb147f1c559..a5adc4fad8e 100644
--- a/pkg/kubelet/eviction/types.go
+++ b/pkg/kubelet/eviction/types.go
@@ -53,7 +53,7 @@ type Config struct {
 // Manager evaluates when an eviction threshold for node stability has been met on the node.
 type Manager interface {
 	// Start starts the control loop to monitor eviction thresholds at specified interval.
-	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
+	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
 
 	// IsUnderMemoryPressure returns true if the node is under memory pressure.
 	IsUnderMemoryPressure() bool
@@ -93,6 +93,9 @@ type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int
 // ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state)
 type ActivePodsFunc func() []*v1.Pod
 
+// PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed.
+type PodCleanedUpFunc func(*v1.Pod) bool
+
 // statsFunc returns the usage stats if known for an input pod.
 type statsFunc func(pod *v1.Pod) (statsapi.PodStats, bool)
 
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 8b6e4098310..ab0047fb3d5 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -1156,7 +1156,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
 		glog.Fatalf("Failed to start cAdvisor %v", err)
 	}
 	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
-	kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
+	kl.evictionManager.Start(kl, kl.GetActivePods, kl.podResourcesAreReclaimed, kl, evictionMonitoringPeriod)
 }
 
 // Run starts the kubelet reacting to config updates
diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 0af466cbcad..790493c4004 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -733,15 +733,10 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
 	return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
 }
 
-// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
-// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
-// API server.
-func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
-	if pod.DeletionTimestamp == nil {
-		// We shouldnt delete pods whose DeletionTimestamp is not set
-		return false
-	}
-	if !notRunning(pod.Status.ContainerStatuses) {
+// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
+// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
+func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	if !notRunning(status.ContainerStatuses) {
 		// We shouldnt delete pods that still have running containers
 		glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
 		return false
@@ -761,6 +756,15 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 	return true
 }
 
+// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
+func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
+	status, ok := kl.statusManager.GetPodStatus(pod.UID)
+	if !ok {
+		status = pod.Status
+	}
+	return kl.PodResourcesAreReclaimed(pod, status)
+}
+
 // notRunning returns true if every status is terminated or waiting, or the status list
 // is empty.
 func notRunning(statuses []v1.ContainerStatus) bool {
diff --git a/pkg/kubelet/status/status_manager.go b/pkg/kubelet/status/status_manager.go
index 36121115020..03cdbf35245 100644
--- a/pkg/kubelet/status/status_manager.go
+++ b/pkg/kubelet/status/status_manager.go
@@ -81,7 +81,7 @@ type PodStatusProvider interface {
 // An object which provides guarantees that a pod can be saftely deleted.
 type PodDeletionSafetyProvider interface {
 	// A function which returns true if the pod can safely be deleted
-	OkToDeletePod(pod *v1.Pod) bool
+	PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool
 }
 
 // Manager is the Source of truth for kubelet pod status, and should be kept up-to-date with
@@ -454,7 +454,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 	m.apiStatusVersions[pod.UID] = status.version
 
 	// We don't handle graceful deletion of mirror pods.
-	if !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod) {
+	if m.canBeDeleted(pod, status.status) {
 		deleteOptions := metav1.NewDeleteOptions(0)
 		// Use the pod UID as the precondition for deletion to prevent deleting a newly created pod with the same name and namespace.
 		deleteOptions.Preconditions = metav1.NewUIDPreconditions(string(pod.UID))
@@ -472,16 +472,18 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 // This method is not thread safe, and most only be accessed by the sync thread.
 func (m *manager) needsUpdate(uid types.UID, status versionedPodStatus) bool {
 	latest, ok := m.apiStatusVersions[uid]
-	return !ok || latest < status.version || m.couldBeDeleted(uid, status.status)
-}
-
-func (m *manager) couldBeDeleted(uid types.UID, status v1.PodStatus) bool {
-	// The pod could be a static pod, so we should translate first.
+	if !ok || latest < status.version {
+		return true
+	}
 	pod, ok := m.podManager.GetPodByUID(uid)
 	if !ok {
 		return false
 	}
-	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod)
+	return m.canBeDeleted(pod, status.status)
+}
+
+func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
+	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.PodResourcesAreReclaimed(pod, status) && pod.DeletionTimestamp != nil
 }
 
 // needsReconcile compares the given status with the status in the pod manager (which
diff --git a/pkg/kubelet/status/testing/BUILD b/pkg/kubelet/status/testing/BUILD
index 93a4a942261..f200aa7e3f0 100644
--- a/pkg/kubelet/status/testing/BUILD
+++ b/pkg/kubelet/status/testing/BUILD
@@ -11,10 +11,7 @@ go_library(
     name = "go_default_library",
     srcs = ["fake_pod_deletion_safety.go"],
    tags = ["automanaged"],
-    deps = [
-        "//pkg/api/v1:go_default_library",
-        "//pkg/kubelet/pod:go_default_library",
-    ],
+    deps = ["//pkg/api/v1:go_default_library"],
 )
 
 filegroup(
diff --git a/pkg/kubelet/status/testing/fake_pod_deletion_safety.go b/pkg/kubelet/status/testing/fake_pod_deletion_safety.go
index c05382907f3..62d785e4fd1 100644
--- a/pkg/kubelet/status/testing/fake_pod_deletion_safety.go
+++ b/pkg/kubelet/status/testing/fake_pod_deletion_safety.go
@@ -16,13 +16,10 @@ limitations under the License.
 
 package testing
 
-import (
-	"k8s.io/kubernetes/pkg/api/v1"
-	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
-)
+import "k8s.io/kubernetes/pkg/api/v1"
 
 type FakePodDeletionSafetyProvider struct{}
 
-func (f *FakePodDeletionSafetyProvider) OkToDeletePod(pod *v1.Pod) bool {
-	return !kubepod.IsMirrorPod(pod) && pod.DeletionTimestamp != nil
+func (f *FakePodDeletionSafetyProvider) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	return true
 }
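
Note for reviewers: the following is a minimal standalone sketch of the timeout/poll pattern that waitForPodCleanup introduces above, written against the standard library time package instead of the kubelet's injectable clock so it can be run in isolation. The helper name waitForCleanup and the cleanedUp predicate are illustrative only and are not part of this patch.

package main

import (
	"fmt"
	"time"
)

// waitForCleanup polls cleanedUp every pollFreq until it returns true or until
// timeout elapses, mirroring the select-on-timer-and-ticker shape used by
// waitForPodCleanup in the patch. It returns whether cleanup finished in time.
func waitForCleanup(cleanedUp func() bool, timeout, pollFreq time.Duration) bool {
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	ticker := time.NewTicker(pollFreq)
	defer ticker.Stop()
	for {
		select {
		case <-timer.C:
			// Timed out before cleanup finished.
			return false
		case <-ticker.C:
			if cleanedUp() {
				return true
			}
		}
	}
}

func main() {
	start := time.Now()
	// Pretend cleanup completes after roughly three seconds.
	done := func() bool { return time.Since(start) > 3*time.Second }
	fmt.Println("cleaned up:", waitForCleanup(done, 30*time.Second, time.Second))
}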