wait for previous evicted pod to be cleaned up
parent 682f3a39a0
commit 21fb487245
@@ -43,6 +43,11 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 )
 
+const (
+	podCleanupTimeout  = 30 * time.Second
+	podCleanupPollFreq = time.Second
+)
+
 // managerImpl implements Manager
 type managerImpl struct {
 	// used to track time
@@ -135,9 +140,18 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 }
 
 // Start starts the control loop to observe and response to low compute resources.
-func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
+func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
 	// start the eviction manager monitoring
-	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
+	go func() {
+		for {
+			if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
+				glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
+				m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
+			} else {
+				time.Sleep(monitoringInterval)
+			}
+		}
+	}()
 }
 
 // IsUnderMemoryPressure returns true if the node is under memory pressure.
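The hunk above changes the shape of the monitoring loop: instead of firing synchronize on a fixed wait.Until ticker, the goroutine now checks whether a pod was evicted and, if so, blocks until that pod is cleaned up before starting the next pass. A minimal stand-alone sketch of that control-loop shape, using only the standard library and hypothetical names (evictOnePod, waitForCleanup) rather than the kubelet's actual types; a fuller sketch of the wait itself appears after the waitForPodCleanup hunk below:

package main

import "time"

// evictOnePod stands in for synchronize: it returns the name of an evicted
// pod, or "" when no eviction was necessary (hypothetical helper).
func evictOnePod() string { return "" }

// waitForCleanup stands in for waitForPodCleanup (hypothetical helper,
// sketched in full further down).
func waitForCleanup(pod string, timeout, pollFreq time.Duration) {}

func monitor(interval time.Duration) {
	for {
		if pod := evictOnePod(); pod != "" {
			// An eviction happened: wait for the evicted pod's resources
			// to be reclaimed before considering another eviction.
			waitForCleanup(pod, 30*time.Second, time.Second)
			continue
		}
		// Nothing was evicted: sleep until the next monitoring pass.
		time.Sleep(interval)
	}
}

func main() {
	monitor(10 * time.Second)
}

The behavioral point is that a successful eviction gates the next pass on cleanup of the previous victim rather than on a fixed interval.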
@@ -188,11 +202,12 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
 }
 
 // synchronize is the main control loop that enforces eviction thresholds.
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
+// Returns the pod that was killed, or nil if no pod was killed.
+func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
 	// if we have nothing to do, just return
 	thresholds := m.config.Thresholds
 	if len(thresholds) == 0 {
-		return
+		return nil
 	}
 
 	glog.V(3).Infof("eviction manager: synchronize housekeeping")
@@ -203,7 +218,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass.
 	hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
 	if err != nil {
-		return
+		return nil
 	}
 	m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
 	m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
@@ -213,7 +228,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
 	if err != nil {
 		glog.Errorf("eviction manager: unexpected err: %v", err)
-		return
+		return nil
 	}
 	debugLogObservations("observations", observations)
 
@@ -291,7 +306,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	starvedResources := getStarvedResources(thresholds)
 	if len(starvedResources) == 0 {
 		glog.V(3).Infof("eviction manager: no resources are starved")
-		return
+		return nil
 	}
 
 	// rank the resources to reclaim by eviction priority
@@ -308,7 +323,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
 	if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
 		glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
-		return
+		return nil
 	}
 
 	glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
@@ -317,16 +332,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	rank, ok := m.resourceToRankFunc[resourceToReclaim]
 	if !ok {
 		glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
-		return
+		return nil
 	}
 
 	// the only candidates viable for eviction are those pods that had anything running.
 	activePods := podFunc()
-	if len(activePods) == 0 {
-		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
-		return
-	}
-
 	// rank the running pods for eviction for the specified resource
 	rank(activePods, statsFunc)
 
@@ -364,14 +374,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		// this is a blocking call and should only return when the pod and its containers are killed.
 		err := m.killPodFunc(pod, status, &gracePeriodOverride)
 		if err != nil {
-			glog.Infof("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
-			continue
+			glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
 		}
-		// success, so we return until the next housekeeping interval
-		glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
-		return
+		return pod
 	}
 	glog.Infof("eviction manager: unable to evict any pods from the node")
+	return nil
+}
+
+func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
+	timeout := m.clock.NewTimer(podCleanupTimeout)
+	tick := m.clock.Tick(podCleanupPollFreq)
+	for {
+		select {
+		case <-timeout.C():
+			glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
+			return
+		case <-tick:
+			if podCleanedUpFunc(pod) {
+				glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
+				return
+			}
+		}
+	}
 }
 
 // reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
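waitForPodCleanup, added above, polls the PodCleanedUpFunc callback once per podCleanupPollFreq and gives up after podCleanupTimeout. The same select-over-a-timer-and-ticker pattern, sketched with the standard library instead of the kubelet's injectable clock (isCleanedUp is a hypothetical stand-in for the callback):

package main

import (
	"fmt"
	"time"
)

// isCleanedUp stands in for the PodCleanedUpFunc callback (hypothetical helper).
func isCleanedUp(pod string) bool { return true }

// waitForCleanup returns once isCleanedUp reports true or the timeout expires,
// whichever comes first.
func waitForCleanup(pod string, timeout, pollFreq time.Duration) {
	deadline := time.NewTimer(timeout)
	defer deadline.Stop()
	ticker := time.NewTicker(pollFreq)
	defer ticker.Stop()
	for {
		select {
		case <-deadline.C:
			fmt.Printf("timed out waiting for %s to be cleaned up\n", pod)
			return
		case <-ticker.C:
			if isCleanedUp(pod) {
				fmt.Printf("%s cleaned up\n", pod)
				return
			}
		}
	}
}

func main() {
	waitForCleanup("example-pod", 30*time.Second, time.Second)
}

The real code goes through m.clock so tests can drive the timeout without real sleeps; the stdlib version trades that testability for brevity.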
@@ -53,7 +53,7 @@ type Config struct {
 // Manager evaluates when an eviction threshold for node stability has been met on the node.
 type Manager interface {
 	// Start starts the control loop to monitor eviction thresholds at specified interval.
-	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
+	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
 
 	// IsUnderMemoryPressure returns true if the node is under memory pressure.
 	IsUnderMemoryPressure() bool
@@ -93,6 +93,9 @@ type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int
 // ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state)
 type ActivePodsFunc func() []*v1.Pod
 
+// PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed.
+type PodCleanedUpFunc func(*v1.Pod) bool
+
 // statsFunc returns the usage stats if known for an input pod.
 type statsFunc func(pod *v1.Pod) (statsapi.PodStats, bool)
 
@@ -1156,7 +1156,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
 		glog.Fatalf("Failed to start cAdvisor %v", err)
 	}
 	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
-	kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
+	kl.evictionManager.Start(kl, kl.GetActivePods, kl.podResourcesAreReclaimed, kl, evictionMonitoringPeriod)
 }
 
 // Run starts the kubelet reacting to config updates
@@ -733,15 +733,10 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
 	return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
 }
 
-// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
-// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
-// API server.
-func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
-	if pod.DeletionTimestamp == nil {
-		// We shouldnt delete pods whose DeletionTimestamp is not set
-		return false
-	}
-	if !notRunning(pod.Status.ContainerStatuses) {
+// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
+// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
+func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	if !notRunning(status.ContainerStatuses) {
 		// We shouldnt delete pods that still have running containers
 		glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
 		return false
@@ -761,6 +756,15 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 	return true
 }
 
+// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
+func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
+	status, ok := kl.statusManager.GetPodStatus(pod.UID)
+	if !ok {
+		status = pod.Status
+	}
+	return kl.PodResourcesAreReclaimed(pod, status)
+}
+
 // notRunning returns true if every status is terminated or waiting, or the status list
 // is empty.
 func notRunning(statuses []v1.ContainerStatus) bool {
@@ -81,7 +81,7 @@ type PodStatusProvider interface {
 // An object which provides guarantees that a pod can be saftely deleted.
 type PodDeletionSafetyProvider interface {
 	// A function which returns true if the pod can safely be deleted
-	OkToDeletePod(pod *v1.Pod) bool
+	PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool
 }
 
 // Manager is the Source of truth for kubelet pod status, and should be kept up-to-date with
@@ -454,7 +454,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 	m.apiStatusVersions[pod.UID] = status.version
 
 	// We don't handle graceful deletion of mirror pods.
-	if !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod) {
+	if m.canBeDeleted(pod, status.status) {
 		deleteOptions := metav1.NewDeleteOptions(0)
 		// Use the pod UID as the precondition for deletion to prevent deleting a newly created pod with the same name and namespace.
 		deleteOptions.Preconditions = metav1.NewUIDPreconditions(string(pod.UID))
@@ -472,16 +472,18 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 // This method is not thread safe, and most only be accessed by the sync thread.
 func (m *manager) needsUpdate(uid types.UID, status versionedPodStatus) bool {
 	latest, ok := m.apiStatusVersions[uid]
-	return !ok || latest < status.version || m.couldBeDeleted(uid, status.status)
-}
-
-func (m *manager) couldBeDeleted(uid types.UID, status v1.PodStatus) bool {
-	// The pod could be a static pod, so we should translate first.
+	if !ok || latest < status.version {
+		return true
+	}
 	pod, ok := m.podManager.GetPodByUID(uid)
 	if !ok {
 		return false
 	}
-	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod)
+	return m.canBeDeleted(pod, status.status)
+}
+
+func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
+	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.PodResourcesAreReclaimed(pod, status) && pod.DeletionTimestamp != nil
 }
 
 // needsReconcile compares the given status with the status in the pod manager (which
@@ -11,10 +11,7 @@ go_library(
     name = "go_default_library",
    srcs = ["fake_pod_deletion_safety.go"],
     tags = ["automanaged"],
-    deps = [
-        "//pkg/api/v1:go_default_library",
-        "//pkg/kubelet/pod:go_default_library",
-    ],
+    deps = ["//pkg/api/v1:go_default_library"],
 )
 
 filegroup(
@@ -16,13 +16,10 @@ limitations under the License.
 
 package testing
 
-import (
-	"k8s.io/kubernetes/pkg/api/v1"
-	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
-)
+import "k8s.io/kubernetes/pkg/api/v1"
 
 type FakePodDeletionSafetyProvider struct{}
 
-func (f *FakePodDeletionSafetyProvider) OkToDeletePod(pod *v1.Pod) bool {
-	return !kubepod.IsMirrorPod(pod) && pod.DeletionTimestamp != nil
+func (f *FakePodDeletionSafetyProvider) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	return true
 }