Merge pull request #43590 from dashpole/eviction_complete_deletion
Automatic merge from submit-queue (batch tested with PRs 46022, 46055, 45308, 46209, 43590)

Eviction does not evict unless the previous pod has been cleaned up

Addresses #43166

This PR makes two main changes. First, it makes the eviction loop re-trigger immediately if there may still be pressure, so that if we have already waited 10 seconds for a pod to be deleted, we don't wait another 10 seconds before the next synchronize call. Second, it waits for the evicted pod to be cleaned up (including volumes, cgroups, etc.) before moving on to the next synchronize call; this wait currently has a 30-second timeout.
commit 99a8f7c303
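As a rough sketch of the two changes described above (stand-in types and names; the actual implementation in the diff below uses the kubelet's own clock, glog, and *v1.Pod), the loop re-synchronizes immediately after an eviction instead of sleeping for the monitoring interval, and in between it polls a cleanup predicate once per second until the evicted pod is cleaned up or a 30-second timeout expires:

package main

import (
	"fmt"
	"time"
)

// Pod stands in for *v1.Pod in this sketch.
type Pod struct{ Name string }

const (
	podCleanupTimeout  = 30 * time.Second // give up waiting for cleanup after this long
	podCleanupPollFreq = time.Second      // how often to re-check the cleanup predicate
)

// run mirrors the new eviction loop: when synchronize reports an evicted pod, wait
// for its cleanup and loop again immediately; otherwise sleep for the interval.
func run(synchronize func() *Pod, podCleanedUp func(*Pod) bool, monitoringInterval time.Duration) {
	for {
		if evictedPod := synchronize(); evictedPod != nil {
			fmt.Printf("pod %s evicted, waiting for cleanup\n", evictedPod.Name)
			waitForPodCleanup(podCleanedUp, evictedPod)
		} else {
			time.Sleep(monitoringInterval)
		}
	}
}

// waitForPodCleanup polls the cleanup predicate until it reports true or the
// timeout fires, whichever comes first.
func waitForPodCleanup(podCleanedUp func(*Pod) bool, pod *Pod) {
	timeout := time.NewTimer(podCleanupTimeout)
	defer timeout.Stop()
	tick := time.NewTicker(podCleanupPollFreq)
	defer tick.Stop()
	for {
		select {
		case <-timeout.C:
			fmt.Printf("timed out waiting for pod %s to be cleaned up\n", pod.Name)
			return
		case <-tick.C:
			if podCleanedUp(pod) {
				fmt.Printf("pod %s cleaned up\n", pod.Name)
				return
			}
		}
	}
}

func main() {
	// Simulate a pod whose volumes and cgroups take a few seconds to be reclaimed.
	cleanedUpAt := time.Now().Add(3 * time.Second)
	waitForPodCleanup(func(*Pod) bool { return time.Now().After(cleanedUpAt) }, &Pod{Name: "besteffort-pod"})
}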
pkg/kubelet/eviction/eviction_manager.go

@@ -43,6 +43,11 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 )
 
+const (
+	podCleanupTimeout = 30 * time.Second
+	podCleanupPollFreq = time.Second
+)
+
 // managerImpl implements Manager
 type managerImpl struct {
 	// used to track time
@@ -135,9 +140,18 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 }
 
 // Start starts the control loop to observe and response to low compute resources.
-func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
+func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
 	// start the eviction manager monitoring
-	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
+	go func() {
+		for {
+			if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
+				glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
+				m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
+			} else {
+				time.Sleep(monitoringInterval)
+			}
+		}
+	}()
 }
 
 // IsUnderMemoryPressure returns true if the node is under memory pressure.
@@ -188,11 +202,12 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
 }
 
 // synchronize is the main control loop that enforces eviction thresholds.
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
+// Returns the pod that was killed, or nil if no pod was killed.
+func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
 	// if we have nothing to do, just return
 	thresholds := m.config.Thresholds
 	if len(thresholds) == 0 {
-		return
+		return nil
 	}
 
 	glog.V(3).Infof("eviction manager: synchronize housekeeping")
@@ -203,7 +218,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass.
 	hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
 	if err != nil {
-		return
+		return nil
 	}
 	m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
 	m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
@@ -213,7 +228,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
 	if err != nil {
 		glog.Errorf("eviction manager: unexpected err: %v", err)
-		return
+		return nil
 	}
 	debugLogObservations("observations", observations)
 
@@ -291,7 +306,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	starvedResources := getStarvedResources(thresholds)
 	if len(starvedResources) == 0 {
 		glog.V(3).Infof("eviction manager: no resources are starved")
-		return
+		return nil
 	}
 
 	// rank the resources to reclaim by eviction priority
@@ -308,7 +323,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
 	if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
 		glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
-		return
+		return nil
 	}
 
 	glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
@@ -317,16 +332,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	rank, ok := m.resourceToRankFunc[resourceToReclaim]
 	if !ok {
 		glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
-		return
+		return nil
 	}
 
 	// the only candidates viable for eviction are those pods that had anything running.
 	activePods := podFunc()
-	if len(activePods) == 0 {
-		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
-		return
-	}
-
 	// rank the running pods for eviction for the specified resource
 	rank(activePods, statsFunc)
 
@@ -364,14 +374,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		// this is a blocking call and should only return when the pod and its containers are killed.
 		err := m.killPodFunc(pod, status, &gracePeriodOverride)
 		if err != nil {
-			glog.Infof("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
-			continue
+			glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
 		}
-		// success, so we return until the next housekeeping interval
-		glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
-		return
+		return pod
 	}
 	glog.Infof("eviction manager: unable to evict any pods from the node")
+	return nil
+}
+
+func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
+	timeout := m.clock.NewTimer(podCleanupTimeout)
+	tick := m.clock.Tick(podCleanupPollFreq)
+	for {
+		select {
+		case <-timeout.C():
+			glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
+			return
+		case <-tick:
+			if podCleanedUpFunc(pod) {
+				glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
+				return
+			}
+		}
+	}
 }
 
 // reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.

pkg/kubelet/eviction/types.go

@@ -53,7 +53,7 @@ type Config struct {
 // Manager evaluates when an eviction threshold for node stability has been met on the node.
 type Manager interface {
 	// Start starts the control loop to monitor eviction thresholds at specified interval.
-	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
+	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
 
 	// IsUnderMemoryPressure returns true if the node is under memory pressure.
 	IsUnderMemoryPressure() bool
@@ -93,6 +93,9 @@ type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int
 // ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state)
 type ActivePodsFunc func() []*v1.Pod
 
+// PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed.
+type PodCleanedUpFunc func(*v1.Pod) bool
+
 // statsFunc returns the usage stats if known for an input pod.
 type statsFunc func(pod *v1.Pod) (statsapi.PodStats, bool)
 

pkg/kubelet/kubelet.go

@@ -1156,7 +1156,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
 		glog.Fatalf("Failed to start cAdvisor %v", err)
 	}
 	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
-	kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
+	kl.evictionManager.Start(kl, kl.GetActivePods, kl.podResourcesAreReclaimed, kl, evictionMonitoringPeriod)
 }
 
 // Run starts the kubelet reacting to config updates

pkg/kubelet/kubelet_pods.go

@@ -733,15 +733,10 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
 	return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
 }
 
-// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
-// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
-// API server.
-func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
-	if pod.DeletionTimestamp == nil {
-		// We shouldnt delete pods whose DeletionTimestamp is not set
-		return false
-	}
-	if !notRunning(pod.Status.ContainerStatuses) {
+// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
+// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
+func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	if !notRunning(status.ContainerStatuses) {
 		// We shouldnt delete pods that still have running containers
 		glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
 		return false
@@ -761,6 +756,15 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 	return true
 }
 
+// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
+func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
+	status, ok := kl.statusManager.GetPodStatus(pod.UID)
+	if !ok {
+		status = pod.Status
+	}
+	return kl.PodResourcesAreReclaimed(pod, status)
+}
+
 // notRunning returns true if every status is terminated or waiting, or the status list
 // is empty.
 func notRunning(statuses []v1.ContainerStatus) bool {

pkg/kubelet/status/status_manager.go

@@ -81,7 +81,7 @@ type PodStatusProvider interface {
 // An object which provides guarantees that a pod can be saftely deleted.
 type PodDeletionSafetyProvider interface {
 	// A function which returns true if the pod can safely be deleted
-	OkToDeletePod(pod *v1.Pod) bool
+	PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool
 }
 
 // Manager is the Source of truth for kubelet pod status, and should be kept up-to-date with
@@ -454,7 +454,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 	m.apiStatusVersions[pod.UID] = status.version
 
 	// We don't handle graceful deletion of mirror pods.
-	if !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod) {
+	if m.canBeDeleted(pod, status.status) {
 		deleteOptions := metav1.NewDeleteOptions(0)
 		// Use the pod UID as the precondition for deletion to prevent deleting a newly created pod with the same name and namespace.
 		deleteOptions.Preconditions = metav1.NewUIDPreconditions(string(pod.UID))
@@ -472,16 +472,18 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 // This method is not thread safe, and most only be accessed by the sync thread.
 func (m *manager) needsUpdate(uid types.UID, status versionedPodStatus) bool {
 	latest, ok := m.apiStatusVersions[uid]
-	return !ok || latest < status.version || m.couldBeDeleted(uid, status.status)
-}
-
-func (m *manager) couldBeDeleted(uid types.UID, status v1.PodStatus) bool {
-	// The pod could be a static pod, so we should translate first.
+	if !ok || latest < status.version {
+		return true
+	}
 	pod, ok := m.podManager.GetPodByUID(uid)
 	if !ok {
 		return false
 	}
-	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod)
+	return m.canBeDeleted(pod, status.status)
+}
+
+func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
+	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.PodResourcesAreReclaimed(pod, status) && pod.DeletionTimestamp != nil
 }
 
 // needsReconcile compares the given status with the status in the pod manager (which

pkg/kubelet/status/testing/BUILD

@@ -11,10 +11,7 @@ go_library(
     name = "go_default_library",
     srcs = ["fake_pod_deletion_safety.go"],
     tags = ["automanaged"],
-    deps = [
-        "//pkg/api/v1:go_default_library",
-        "//pkg/kubelet/pod:go_default_library",
-    ],
+    deps = ["//pkg/api/v1:go_default_library"],
 )
 
 filegroup(

pkg/kubelet/status/testing/fake_pod_deletion_safety.go

@@ -16,13 +16,10 @@ limitations under the License.
 
 package testing
 
-import (
-	"k8s.io/kubernetes/pkg/api/v1"
-	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
-)
+import "k8s.io/kubernetes/pkg/api/v1"
 
 type FakePodDeletionSafetyProvider struct{}
 
-func (f *FakePodDeletionSafetyProvider) OkToDeletePod(pod *v1.Pod) bool {
-	return !kubepod.IsMirrorPod(pod) && pod.DeletionTimestamp != nil
+func (f *FakePodDeletionSafetyProvider) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	return true
 }
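One wiring detail worth calling out from the types.go and kubelet.go hunks: Manager.Start now takes a PodCleanedUpFunc, and the kubelet satisfies it by passing the bound method kl.podResourcesAreReclaimed. A standalone sketch of that pattern, using hypothetical stand-in types rather than the kubelet's real ones:

package main

import "fmt"

// Pod and PodCleanedUpFunc stand in for *v1.Pod and eviction.PodCleanedUpFunc.
type Pod struct{ Name string }

type PodCleanedUpFunc func(*Pod) bool

// evictionManager receives the cleanup predicate at Start time, mirroring the new interface.
type evictionManager struct{ podCleanedUp PodCleanedUpFunc }

func (m *evictionManager) Start(podCleanedUp PodCleanedUpFunc) {
	m.podCleanedUp = podCleanedUp
}

// kubelet exposes a method whose signature matches PodCleanedUpFunc, so the bound
// method value kl.podResourcesAreReclaimed can be handed to Start directly.
type kubelet struct{ reclaimed map[string]bool }

func (kl *kubelet) podResourcesAreReclaimed(pod *Pod) bool {
	return kl.reclaimed[pod.Name]
}

func main() {
	kl := &kubelet{reclaimed: map[string]bool{"evicted-pod": true}}
	m := &evictionManager{}
	m.Start(kl.podResourcesAreReclaimed) // method value satisfies PodCleanedUpFunc
	fmt.Println(m.podCleanedUp(&Pod{Name: "evicted-pod"})) // prints: true
}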