wait for previous evicted pod to be cleaned up

David Ashpole 2017-05-16 14:23:42 -07:00
parent 682f3a39a0
commit 21fb487245
7 changed files with 76 additions and 48 deletions

View File

@@ -43,6 +43,11 @@ import (
"k8s.io/kubernetes/pkg/kubelet/util/format"
)
const (
podCleanupTimeout = 30 * time.Second
podCleanupPollFreq = time.Second
)
// managerImpl implements Manager
type managerImpl struct {
// used to track time
@@ -135,9 +140,18 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
}
// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
// start the eviction manager monitoring
go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
go func() {
for {
if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
} else {
time.Sleep(monitoringInterval)
}
}
}()
}
// IsUnderMemoryPressure returns true if the node is under memory pressure.
@@ -188,11 +202,12 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
}
// synchronize is the main control loop that enforces eviction thresholds.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
// Returns the pod that was killed, or nil if no pod was killed.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
// if we have nothing to do, just return
thresholds := m.config.Thresholds
if len(thresholds) == 0 {
return
return nil
}
glog.V(3).Infof("eviction manager: synchronize housekeeping")
@@ -203,7 +218,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// this may error if cadvisor has yet to complete housekeeping, so we will just try again in the next pass.
hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
if err != nil {
return
return nil
}
m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
@@ -213,7 +228,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
if err != nil {
glog.Errorf("eviction manager: unexpected err: %v", err)
return
return nil
}
debugLogObservations("observations", observations)
@@ -291,7 +306,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
starvedResources := getStarvedResources(thresholds)
if len(starvedResources) == 0 {
glog.V(3).Infof("eviction manager: no resources are starved")
return
return nil
}
// rank the resources to reclaim by eviction priority
@@ -308,7 +323,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
return
return nil
}
glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
@@ -317,16 +332,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
rank, ok := m.resourceToRankFunc[resourceToReclaim]
if !ok {
glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
return
return nil
}
// the only candidates viable for eviction are those pods that have something running.
activePods := podFunc()
if len(activePods) == 0 {
glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
return
}
// rank the running pods for eviction for the specified resource
rank(activePods, statsFunc)
@@ -364,14 +374,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// this is a blocking call and should only return when the pod and its containers are killed.
err := m.killPodFunc(pod, status, &gracePeriodOverride)
if err != nil {
glog.Infof("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
continue
glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
}
// success, so we return until the next housekeeping interval
glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod))
return
return pod
}
glog.Infof("eviction manager: unable to evict any pods from the node")
return nil
}
func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
timeout := m.clock.NewTimer(podCleanupTimeout)
tick := m.clock.Tick(podCleanupPollFreq)
for {
select {
case <-timeout.C():
glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
return
case <-tick:
if podCleanedUpFunc(pod) {
glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
return
}
}
}
}
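The new waitForPodCleanup helper is a bounded poll: after an eviction the manager gives the kubelet up to podCleanupTimeout (30 seconds) to release the pod's resources, checking once per podCleanupPollFreq (one second) before the next synchronize pass. The kubelet version goes through the manager's injected clock so it can be driven from tests; purely as an illustration of the same timer-versus-ticker select pattern, a standalone sketch using only the standard library (the waitFor name and the condition closure are made up for this example, not part of the change) might look like:

package main

import (
	"fmt"
	"time"
)

// waitFor polls condition() every pollFreq until it returns true or the
// timeout elapses, mirroring the shape of waitForPodCleanup: one timer
// channel for the deadline and one ticker channel for each poll attempt.
func waitFor(condition func() bool, timeout, pollFreq time.Duration) bool {
	deadline := time.NewTimer(timeout)
	defer deadline.Stop()
	tick := time.NewTicker(pollFreq)
	defer tick.Stop()
	for {
		select {
		case <-deadline.C:
			return false // timed out waiting for cleanup
		case <-tick.C:
			if condition() {
				return true // condition satisfied before the deadline
			}
		}
	}
}

func main() {
	start := time.Now()
	cleaned := waitFor(func() bool { return time.Since(start) > 3*time.Second },
		30*time.Second, time.Second)
	fmt.Println("cleaned up:", cleaned)
}

Returning a bool keeps the sketch self-contained; the manager itself only logs and returns on timeout, since a slow cleanup is not fatal and the next synchronize pass proceeds either way.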
// reclaimNodeLevelResources attempts to reclaim node-level resources. Returns true if thresholds were satisfied and no pod eviction is required.

View File

@@ -53,7 +53,7 @@ type Config struct {
// Manager evaluates when an eviction threshold for node stability has been met on the node.
type Manager interface {
// Start starts the control loop to monitor eviction thresholds at specified interval.
Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
// IsUnderMemoryPressure returns true if the node is under memory pressure.
IsUnderMemoryPressure() bool
@@ -93,6 +93,9 @@ type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int
// ActivePodsFunc returns pods bound to the kubelet that are active (i.e. non-terminal state)
type ActivePodsFunc func() []*v1.Pod
// PodCleanedUpFunc returns true if all resources associated with a pod have been reclaimed.
type PodCleanedUpFunc func(*v1.Pod) bool
// statsFunc returns the usage stats if known for an input pod.
type statsFunc func(pod *v1.Pod) (statsapi.PodStats, bool)
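The new PodCleanedUpFunc is just a predicate over a pod, so any func(*v1.Pod) bool can be handed to Start; the kubelet supplies podResourcesAreReclaimed (see the initializeRuntimeDependentModules hunk below), and tests can pass a trivial closure. A hypothetical example, not part of this change (the stillRunning map and the UID are invented for illustration; the import paths match those used elsewhere in this diff):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/kubernetes/pkg/api/v1"
)

func main() {
	// Pretend bookkeeping: pods whose resources have not been reclaimed yet.
	stillRunning := map[types.UID]bool{"uid-1": true}

	// Satisfies the PodCleanedUpFunc signature: true once nothing is left to reclaim.
	podCleanedUp := func(pod *v1.Pod) bool {
		return !stillRunning[pod.UID]
	}

	p := &v1.Pod{}
	p.UID = "uid-1"
	fmt.Println(podCleanedUp(p)) // false: resources still held

	delete(stillRunning, "uid-1")
	fmt.Println(podCleanedUp(p)) // true: safe for the eviction manager to continue
}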

View File

@@ -1156,7 +1156,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
glog.Fatalf("Failed to start cAdvisor %v", err)
}
// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
kl.evictionManager.Start(kl, kl.GetActivePods, kl.podResourcesAreReclaimed, kl, evictionMonitoringPeriod)
}
// Run starts the kubelet reacting to config updates

View File

@@ -733,15 +733,10 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
}
// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
// API server.
func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
if pod.DeletionTimestamp == nil {
// We shouldn't delete pods whose DeletionTimestamp is not set
return false
}
if !notRunning(pod.Status.ContainerStatuses) {
// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
if !notRunning(status.ContainerStatuses) {
// We shouldn't delete pods that still have running containers
glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
return false
@@ -761,6 +756,15 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
return true
}
// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
status, ok := kl.statusManager.GetPodStatus(pod.UID)
if !ok {
status = pod.Status
}
return kl.PodResourcesAreReclaimed(pod, status)
}
// notRunning returns true if every status is terminated or waiting, or the status list
// is empty.
func notRunning(statuses []v1.ContainerStatus) bool {

View File

@@ -81,7 +81,7 @@ type PodStatusProvider interface {
// An object which provides guarantees that a pod can be safely deleted.
type PodDeletionSafetyProvider interface {
// A function which returns true if the pod can safely be deleted
OkToDeletePod(pod *v1.Pod) bool
PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool
}
// Manager is the Source of truth for kubelet pod status, and should be kept up-to-date with
@@ -454,7 +454,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
m.apiStatusVersions[pod.UID] = status.version
// We don't handle graceful deletion of mirror pods.
if !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod) {
if m.canBeDeleted(pod, status.status) {
deleteOptions := metav1.NewDeleteOptions(0)
// Use the pod UID as the precondition for deletion to prevent deleting a newly created pod with the same name and namespace.
deleteOptions.Preconditions = metav1.NewUIDPreconditions(string(pod.UID))
@@ -472,16 +472,18 @@
// This method is not thread safe, and must only be accessed by the sync thread.
func (m *manager) needsUpdate(uid types.UID, status versionedPodStatus) bool {
latest, ok := m.apiStatusVersions[uid]
return !ok || latest < status.version || m.couldBeDeleted(uid, status.status)
}
func (m *manager) couldBeDeleted(uid types.UID, status v1.PodStatus) bool {
// The pod could be a static pod, so we should translate first.
if !ok || latest < status.version {
return true
}
pod, ok := m.podManager.GetPodByUID(uid)
if !ok {
return false
}
return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.OkToDeletePod(pod)
return m.canBeDeleted(pod, status.status)
}
func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus) bool {
return !kubepod.IsMirrorPod(pod) && m.podDeletionSafety.PodResourcesAreReclaimed(pod, status) && pod.DeletionTimestamp != nil
}
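canBeDeleted now gates the API-server delete on three independent conditions: the pod is not a mirror pod, its node-level resources have been reclaimed, and its DeletionTimestamp is set. A standalone illustration of how those conditions combine, with plain booleans standing in for the real IsMirrorPod, PodResourcesAreReclaimed and DeletionTimestamp checks (none of this code is part of the change):

package main

import "fmt"

// canBeDeleted mirrors the status manager's predicate with booleans in
// place of the real checks.
func canBeDeleted(isMirror, reclaimed, hasDeletionTimestamp bool) bool {
	return !isMirror && reclaimed && hasDeletionTimestamp
}

func main() {
	cases := []struct {
		name                                      string
		isMirror, reclaimed, hasDeletionTimestamp bool
	}{
		{"mirror pod", true, true, true},                     // never deleted via this path
		{"resources not yet reclaimed", false, false, true},  // keep waiting for cleanup
		{"no deletion timestamp", false, true, false},        // pod is not being deleted
		{"deletable", false, true, true},
	}
	for _, c := range cases {
		fmt.Printf("%-28s -> %v\n", c.name, canBeDeleted(c.isMirror, c.reclaimed, c.hasDeletionTimestamp))
	}
}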
// needsReconcile compares the given status with the status in the pod manager (which

View File

@@ -11,10 +11,7 @@ go_library(
name = "go_default_library",
srcs = ["fake_pod_deletion_safety.go"],
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/pod:go_default_library",
],
deps = ["//pkg/api/v1:go_default_library"],
)
filegroup(

View File

@@ -16,13 +16,10 @@ limitations under the License.
package testing
import (
"k8s.io/kubernetes/pkg/api/v1"
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
)
import "k8s.io/kubernetes/pkg/api/v1"
type FakePodDeletionSafetyProvider struct{}
func (f *FakePodDeletionSafetyProvider) OkToDeletePod(pod *v1.Pod) bool {
return !kubepod.IsMirrorPod(pod) && pod.DeletionTimestamp != nil
func (f *FakePodDeletionSafetyProvider) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
return true
}