wait for previous evicted pod to be cleaned up
parent 682f3a39a0
commit 21fb487245

@@ -43,6 +43,11 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 )
 
+const (
+	podCleanupTimeout  = 30 * time.Second
+	podCleanupPollFreq = time.Second
+)
+
 // managerImpl implements Manager
 type managerImpl struct {
 	// used to track time
@@ -135,9 +140,18 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 }
 
 // Start starts the control loop to observe and response to low compute resources.
-func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
+func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
 	// start the eviction manager monitoring
-	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
+	go func() {
+		for {
+			if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
+				glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
+				m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
+			} else {
+				time.Sleep(monitoringInterval)
+			}
+		}
+	}()
 }
 
 // IsUnderMemoryPressure returns true if the node is under memory pressure.
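
The new Start loop replaces the fixed-interval wait.Until with an act-then-wait pattern: synchronize runs back to back while pods are being evicted, blocking on cleanup in between, and only sleeps for monitoringInterval when nothing was evicted. This matters because one synchronize pass evicts at most one pod, so sleeping after every pass would throttle recovery under sustained pressure. A minimal standalone sketch of that control flow (evictOne and pending are hypothetical stand-ins, not names from this commit):

package main

import (
	"fmt"
	"time"
)

// evictOne stands in for synchronize: it returns the name of an evicted
// item, or "" when the node is healthy and there is nothing to do.
func evictOne(pending *[]string) string {
	if len(*pending) == 0 {
		return ""
	}
	name := (*pending)[0]
	*pending = (*pending)[1:]
	return name
}

func main() {
	pending := []string{"pod-a", "pod-b"}
	monitoringInterval := 10 * time.Millisecond
	stop := time.After(100 * time.Millisecond) // bounded for the example
	for {
		select {
		case <-stop:
			return
		default:
		}
		if name := evictOne(&pending); name != "" {
			// This is where the commit blocks in waitForPodCleanup so the
			// next pass observes the reclaimed resources.
			fmt.Println("evicted", name, "- waiting for cleanup before next pass")
		} else {
			// No pressure: sleep for the monitoring interval as before.
			time.Sleep(monitoringInterval)
		}
	}
}
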
@@ -188,11 +202,12 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
 }
 
 // synchronize is the main control loop that enforces eviction thresholds.
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
+// Returns the pod that was killed, or nil if no pod was killed.
+func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
 	// if we have nothing to do, just return
 	thresholds := m.config.Thresholds
 	if len(thresholds) == 0 {
-		return
+		return nil
 	}
 
 	glog.V(3).Infof("eviction manager: synchronize housekeeping")
@@ -203,7 +218,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass.
 	hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
 	if err != nil {
-		return
+		return nil
 	}
 	m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
 	m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
@@ -213,7 +228,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
 	if err != nil {
 		glog.Errorf("eviction manager: unexpected err: %v", err)
-		return
+		return nil
 	}
 	debugLogObservations("observations", observations)
 
@@ -291,7 +306,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	starvedResources := getStarvedResources(thresholds)
 	if len(starvedResources) == 0 {
 		glog.V(3).Infof("eviction manager: no resources are starved")
-		return
+		return nil
 	}
 
 	// rank the resources to reclaim by eviction priority
@@ -308,7 +323,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
 	if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
 		glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
-		return
+		return nil
 	}
 
 	glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
@@ -317,16 +332,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	rank, ok := m.resourceToRankFunc[resourceToReclaim]
 	if !ok {
 		glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
-		return
+		return nil
 	}
 
 	// the only candidates viable for eviction are those pods that had anything running.
 	activePods := podFunc()
-	if len(activePods) == 0 {
-		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
-		return
-	}
-
 	// rank the running pods for eviction for the specified resource
 	rank(activePods, statsFunc)
 
@@ -364,14 +374,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		// this is a blocking call and should only return when the pod and its containers are killed.
 		err := m.killPodFunc(pod, status, &gracePeriodOverride)
 		if err != nil {
-			glog.Infof("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
-			continue
+			glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
 		}
 		// success, so we return until the next housekeeping interval
 		glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
-		return
+		return pod
 	}
 	glog.Infof("eviction manager: unable to evict any pods from the node")
+	return nil
 }
 
+func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
+	timeout := m.clock.NewTimer(podCleanupTimeout)
+	tick := m.clock.Tick(podCleanupPollFreq)
+	for {
+		select {
+		case <-timeout.C():
+			glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
+			return
+		case <-tick:
+			if podCleanedUpFunc(pod) {
+				glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
+				return
+			}
+		}
+	}
+}
+
 // reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
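
waitForPodCleanup bounds the wait at podCleanupTimeout (30s) while polling the PodCleanedUpFunc once per podCleanupPollFreq (1s); using the manager's injected m.clock rather than the time package directly keeps the loop testable with a fake clock. A self-contained sketch of the same select-on-timer-or-tick shape, using only the standard library and a stand-in predicate:

package main

import (
	"fmt"
	"time"
)

// waitForCleanup polls cleanedUp until it reports true or the timeout
// fires, mirroring the timer/tick select above. cleanedUp is a stand-in
// for a bound PodCleanedUpFunc.
func waitForCleanup(cleanedUp func() bool, timeout, pollFreq time.Duration) bool {
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	ticker := time.NewTicker(pollFreq)
	defer ticker.Stop()
	for {
		select {
		case <-timer.C:
			return false // gave up: cleanup took longer than the timeout
		case <-ticker.C:
			if cleanedUp() {
				return true
			}
		}
	}
}

func main() {
	// Simulate cleanup finishing after ~30ms against a 100ms timeout.
	done := time.Now().Add(30 * time.Millisecond)
	ok := waitForCleanup(func() bool { return time.Now().After(done) },
		100*time.Millisecond, 10*time.Millisecond)
	fmt.Println("cleaned up:", ok) // cleaned up: true
}

The sketch uses time.NewTicker with a deferred Stop so it does not leak a ticker; the commit's m.clock.Tick is acceptable in context because the eviction manager runs for the life of the process.
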
@@ -53,7 +53,7 @@ type Config struct {
 // Manager evaluates when an eviction threshold for node stability has been met on the node.
 type Manager interface {
 	// Start starts the control loop to monitor eviction thresholds at specified interval.
-	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
+	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
 
 	// IsUnderMemoryPressure returns true if the node is under memory pressure.
 	IsUnderMemoryPressure() bool
@@ -93,6 +93,9 @@ type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int
 // ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state)
 type ActivePodsFunc func() []*v1.Pod
 
+// PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed.
+type PodCleanedUpFunc func(*v1.Pod) bool
+
 // statsFunc returns the usage stats if known for an input pod.
 type statsFunc func(pod *v1.Pod) (statsapi.PodStats, bool)
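
PodCleanedUpFunc is a plain func(*v1.Pod) bool, so the eviction manager can accept any closure or method value with that shape; the kubelet passes kl.podResourcesAreReclaimed in the hunk below. A self-contained illustration, with a toy Pod type standing in for *v1.Pod:

package main

import "fmt"

// Pod is a stand-in for *v1.Pod so the sketch is self-contained.
type Pod struct {
	Name            string
	VolumesDetached bool
}

// PodCleanedUpFunc mirrors the shape of the type added in this commit,
// but over the stand-in Pod.
type PodCleanedUpFunc func(*Pod) bool

// startLoop stands in for Manager.Start: it only needs the predicate,
// not the kubelet internals that answer it.
func startLoop(cleanedUp PodCleanedUpFunc, p *Pod) {
	fmt.Printf("%s cleaned up: %v\n", p.Name, cleanedUp(p))
}

func main() {
	// Any closure (or method value, like kl.podResourcesAreReclaimed)
	// with this signature satisfies the type.
	cleanedUp := func(p *Pod) bool { return p.VolumesDetached }
	startLoop(cleanedUp, &Pod{Name: "pod-a", VolumesDetached: true})
}
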
@@ -1156,7 +1156,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
 		glog.Fatalf("Failed to start cAdvisor %v", err)
 	}
 	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
-	kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
+	kl.evictionManager.Start(kl, kl.GetActivePods, kl.podResourcesAreReclaimed, kl, evictionMonitoringPeriod)
 }
 
 // Run starts the kubelet reacting to config updates

@@ -733,15 +733,10 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
 	return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
 }
 
-// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
-// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
-// API server.
-func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
-	if pod.DeletionTimestamp == nil {
-		// We shouldnt delete pods whose DeletionTimestamp is not set
-		return false
-	}
-	if !notRunning(pod.Status.ContainerStatuses) {
+// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
+// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
+func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	if !notRunning(status.ContainerStatuses) {
 		// We shouldnt delete pods that still have running containers
 		glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
 		return false
@@ -761,6 +756,15 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 	return true
 }
 
+// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
+func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
+	status, ok := kl.statusManager.GetPodStatus(pod.UID)
+	if !ok {
+		status = pod.Status
+	}
+	return kl.PodResourcesAreReclaimed(pod, status)
+}
+
 // notRunning returns true if every status is terminated or waiting, or the status list
 // is empty.
 func notRunning(statuses []v1.ContainerStatus) bool {
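
The podResourcesAreReclaimed adapter prefers the status manager's cached status and falls back to the status carried on the pod object when no cached entry exists; the resulting one-argument method value is what satisfies PodCleanedUpFunc. A sketch of that lookup-with-fallback plus method-value wiring, with toy types (statusCache and kubeletStub are stand-ins, not the real kubelet types):

package main

import "fmt"

// statusCache stands in for the kubelet's status manager: it may or may
// not hold a fresher status for a given pod.
type statusCache map[string]string

type kubeletStub struct {
	statuses statusCache
}

// resourcesReclaimed mirrors the adapter's shape: prefer the cached
// status, fall back to the status carried on the object itself.
func (k *kubeletStub) resourcesReclaimed(name, fallbackStatus string) bool {
	status, ok := k.statuses[name]
	if !ok {
		status = fallbackStatus
	}
	return status == "terminated"
}

func main() {
	k := &kubeletStub{statuses: statusCache{"pod-a": "terminated"}}
	// A method value like k.resourcesReclaimed is an ordinary func value,
	// which is how kl.podResourcesAreReclaimed is handed to Start.
	pred := k.resourcesReclaimed
	fmt.Println(pred("pod-a", "running"))    // true: cache wins
	fmt.Println(pred("pod-b", "terminated")) // true: fallback used
}
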
@@ -81,7 +81,7 @@ type PodStatusProvider interface {
 // An object which provides guarantees that a pod can be saftely deleted.
 type PodDeletionSafetyProvider interface {
 	// A function which returns true if the pod can safely be deleted
-	OkToDeletePod(pod *v1.Pod) bool
+	PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool
 }
 
 // Manager is the Source of truth for kubelet pod status, and should be kept up-to-date with
@@ -454,7 +454,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 	m.apiStatusVersions[pod.UID] = status.version
 
 	// We don't handle graceful deletion of mirror pods.
-	if !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod) {
+	if m.canBeDeleted(pod, status.status) {
 		deleteOptions := metav1.NewDeleteOptions(0)
 		// Use the pod UID as the precondition for deletion to prevent deleting a newly created pod with the same name and namespace.
 		deleteOptions.Preconditions = metav1.NewUIDPreconditions(string(pod.UID))
@@ -472,16 +472,18 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
 // This method is not thread safe, and most only be accessed by the sync thread.
 func (m *manager) needsUpdate(uid types.UID, status versionedPodStatus) bool {
 	latest, ok := m.apiStatusVersions[uid]
-	return !ok || latest < status.version || m.couldBeDeleted(uid, status.status)
-}
-
-func (m *manager) couldBeDeleted(uid types.UID, status v1.PodStatus) bool {
+	if !ok || latest < status.version {
+		return true
+	}
 	// The pod could be a static pod, so we should translate first.
 	pod, ok := m.podManager.GetPodByUID(uid)
 	if !ok {
 		return false
 	}
-	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod)
+	return m.canBeDeleted(pod, status.status)
 }
 
+func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
+	return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.PodResourcesAreReclaimed(pod, status) && pod.DeletionTimestamp != nil
+}
+
 // needsReconcile compares the given status with the status in the pod manager (which
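
canBeDeleted gathers the deletion-safety conditions in one predicate: the pod must not be a mirror pod, its resources must be reclaimed, and its DeletionTimestamp must be set. A toy truth-table sketch of that conjunction (the pod struct here is a stand-in, not the API type):

package main

import "fmt"

// pod is a stand-in for the API type, reduced to the three fields the
// predicate inspects.
type pod struct {
	mirror    bool
	reclaimed bool
	deleting  bool // DeletionTimestamp != nil in the real code
}

// canBeDeleted mirrors the commit's conjunction: only a non-mirror pod
// that is marked for deletion and fully reclaimed may be deleted.
func canBeDeleted(p pod) bool {
	return !p.mirror && p.reclaimed && p.deleting
}

func main() {
	cases := []pod{
		{mirror: false, reclaimed: true, deleting: true},  // deletable
		{mirror: false, reclaimed: false, deleting: true}, // still cleaning up
		{mirror: true, reclaimed: true, deleting: true},   // mirror pods excluded
	}
	for _, p := range cases {
		fmt.Printf("%+v -> %v\n", p, canBeDeleted(p))
	}
}
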
@@ -11,10 +11,7 @@ go_library(
     name = "go_default_library",
     srcs = ["fake_pod_deletion_safety.go"],
     tags = ["automanaged"],
-    deps = [
-        "//pkg/api/v1:go_default_library",
-        "//pkg/kubelet/pod:go_default_library",
-    ],
+    deps = ["//pkg/api/v1:go_default_library"],
 )
 
 filegroup(

@@ -16,13 +16,10 @@ limitations under the License.
 
 package testing
 
-import (
-	"k8s.io/kubernetes/pkg/api/v1"
-	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
-)
+import "k8s.io/kubernetes/pkg/api/v1"
 
 type FakePodDeletionSafetyProvider struct{}
 
-func (f *FakePodDeletionSafetyProvider) OkToDeletePod(pod *v1.Pod) bool {
-	return !kubepod.IsMirrorPod(pod) && pod.DeletionTimestamp != nil
+func (f *FakePodDeletionSafetyProvider) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
+	return true
 }