
A number of race conditions exist when pods are terminated early in their lifecycle, because components in the kubelet need to know "no containers are running" or "no containers can be started from now on" but rely on outdated state. Only the pod worker knows whether containers are being started for a given pod, which is required to know when a pod is "terminated" (no running containers, none coming).

Move that responsibility and the podKiller function into the pod workers, and route everything that was killing the pod through the UpdatePod loop. Split syncPod into three phases - setup, terminate containers, and clean up the pod - and make the transitions between those methods visible to other components. After this change, to kill a pod you tell the pod worker to UpdatePod({UpdateType: SyncPodKill, Pod: pod}).

Several places in the kubelet were incorrect about whether they were handling terminating (should stop running, might still have containers) or terminated (no running containers) pods. The pod worker now exposes methods that let other loops know when to set up or tear down resources based on the state of the pod. These methods remove the possibility of race conditions by ensuring a single component is responsible for knowing each pod's allowed state; other components simply check whether a pod is in that window by UID.

Removing containers no longer blocks final pod deletion in the API server and is handled as background cleanup. Node shutdown no longer marks pods as failed, as they can be restarted in the next step.

See https://docs.google.com/document/d/1Pic5TPntdJnYfIpBeZndDelM-AbS4FN9H2GTLFhoJ04/edit# for details
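A minimal sketch of the kill path described above (illustrative only: the kl.podWorkers accessor, the UpdatePodOptions struct name, and the kubetypes alias are assumptions here; the message above only specifies the UpdateType and Pod fields):

	// Ask the pod worker to terminate the pod. The worker serializes this
	// request with any in-flight sync for the same UID and owns the
	// terminating -> terminated transition for the pod.
	kl.podWorkers.UpdatePod(UpdatePodOptions{
		UpdateType: kubetypes.SyncPodKill,
		Pod:        pod,
	})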
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package preemption

import (
	"fmt"
	"math"

	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	"k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const message = "Preempted in order to admit critical pod"

// CriticalPodAdmissionHandler is an AdmissionFailureHandler that handles admission failure for Critical Pods.
// If the ONLY admission failures are due to insufficient resources, then CriticalPodAdmissionHandler evicts pods
// so that the critical pod can be admitted. For evictions, the CriticalPodAdmissionHandler evicts a set of pods that
// frees up the required resource requests. The set of pods is designed to minimize impact, and is prioritized according to the ordering:
// minimal impact for guaranteed pods > minimal impact for burstable pods > minimal impact for besteffort pods.
// minimal impact is defined as follows: fewest pods evicted > fewest total requests of pods.
// finding the fewest total requests of pods is considered besteffort.
type CriticalPodAdmissionHandler struct {
	getPodsFunc eviction.ActivePodsFunc
	killPodFunc eviction.KillPodFunc
	recorder    record.EventRecorder
}

var _ lifecycle.AdmissionFailureHandler = &CriticalPodAdmissionHandler{}

func NewCriticalPodAdmissionHandler(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, recorder record.EventRecorder) *CriticalPodAdmissionHandler {
	return &CriticalPodAdmissionHandler{
		getPodsFunc: getPodsFunc,
		killPodFunc: killPodFunc,
		recorder:    recorder,
	}
}

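// Wiring sketch (illustrative; not part of this file): the kubelet is expected
// to construct the handler with an active-pods lister and a kill function that
// routes through the pod workers. The names kl.GetActivePods, killPodNow, and
// kl.podWorkers are assumptions for this example, not guaranteed APIs.
//
//	handler := NewCriticalPodAdmissionHandler(
//		kl.GetActivePods,                    // eviction.ActivePodsFunc
//		killPodNow(kl.podWorkers, recorder), // eviction.KillPodFunc
//		recorder,                            // record.EventRecorder
//	)
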
// HandleAdmissionFailure gracefully handles admission rejection and, in some
// cases, allows admission of the pod despite its previous failure.
func (c *CriticalPodAdmissionHandler) HandleAdmissionFailure(admitPod *v1.Pod, failureReasons []lifecycle.PredicateFailureReason) ([]lifecycle.PredicateFailureReason, error) {
	if !kubetypes.IsCriticalPod(admitPod) {
		return failureReasons, nil
	}
	// InsufficientResourceError is not a reason to reject a critical pod.
	// Instead of rejecting, we free up resources to admit it, if no other reasons for rejection exist.
	nonResourceReasons := []lifecycle.PredicateFailureReason{}
	resourceReasons := []*admissionRequirement{}
	for _, reason := range failureReasons {
		if r, ok := reason.(*lifecycle.InsufficientResourceError); ok {
			resourceReasons = append(resourceReasons, &admissionRequirement{
				resourceName: r.ResourceName,
				quantity:     r.GetInsufficientAmount(),
			})
		} else {
			nonResourceReasons = append(nonResourceReasons, reason)
		}
	}
	if len(nonResourceReasons) > 0 {
		// Return only reasons that are not resource related, since critical pods cannot fail admission for resource reasons.
		return nonResourceReasons, nil
	}
	err := c.evictPodsToFreeRequests(admitPod, admissionRequirementList(resourceReasons))
	// if no error is returned, preemption succeeded and the pod is safe to admit.
	return nil, err
}

// evictPodsToFreeRequests takes a list of insufficient resources, and attempts to free them by evicting pods
// based on requests. For example, if the only insufficient resource is 200MB of memory, this function could
// evict a pod with request=250MB.
func (c *CriticalPodAdmissionHandler) evictPodsToFreeRequests(admitPod *v1.Pod, insufficientResources admissionRequirementList) error {
	podsToPreempt, err := getPodsToPreempt(admitPod, c.getPodsFunc(), insufficientResources)
	if err != nil {
		return fmt.Errorf("preemption: error finding a set of pods to preempt: %v", err)
	}
	for _, pod := range podsToPreempt {
		// record that we are evicting the pod
		c.recorder.Eventf(pod, v1.EventTypeWarning, events.PreemptContainer, message)
		// this is a blocking call and should only return when the pod and its containers are killed.
		klog.V(3).InfoS("Preempting pod to free up resources", "pod", klog.KObj(pod), "podUID", pod.UID, "insufficientResources", insufficientResources.toString())
		err := c.killPodFunc(pod, true, nil, func(status *v1.PodStatus) {
			status.Phase = v1.PodFailed
			status.Reason = events.PreemptContainer
			status.Message = message
		})
		if err != nil {
			klog.ErrorS(err, "Failed to evict pod", "pod", klog.KObj(pod))
			// In future syncPod loops, the kubelet will retry the pod deletion steps that it was stuck on.
			continue
		}
		if len(insufficientResources) > 0 {
			metrics.Preemptions.WithLabelValues(insufficientResources[0].resourceName.String()).Inc()
		} else {
			metrics.Preemptions.WithLabelValues("").Inc()
		}
		klog.InfoS("Pod evicted successfully", "pod", klog.KObj(pod))
	}
	return nil
}

// getPodsToPreempt returns a list of pods that could be preempted to free requests >= requirements
func getPodsToPreempt(pod *v1.Pod, pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
	bestEffortPods, burstablePods, guaranteedPods := sortPodsByQOS(pod, pods)

	// make sure that pods exist to reclaim the requirements
	unableToMeetRequirements := requirements.subtract(append(append(bestEffortPods, burstablePods...), guaranteedPods...)...)
	if len(unableToMeetRequirements) > 0 {
		return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", unableToMeetRequirements.toString())
	}
	// find the guaranteed pods we would need to evict if we already evicted ALL burstable and besteffort pods.
	guaranteedToEvict, err := getPodsToPreemptByDistance(guaranteedPods, requirements.subtract(append(bestEffortPods, burstablePods...)...))
	if err != nil {
		return nil, err
	}
	// Find the burstable pods we would need to evict if we already evicted ALL besteffort pods, and the required guaranteed pods.
	burstableToEvict, err := getPodsToPreemptByDistance(burstablePods, requirements.subtract(append(bestEffortPods, guaranteedToEvict...)...))
	if err != nil {
		return nil, err
	}
	// Find the besteffort pods we would need to evict if we already evicted the required guaranteed and burstable pods.
	bestEffortToEvict, err := getPodsToPreemptByDistance(bestEffortPods, requirements.subtract(append(burstableToEvict, guaranteedToEvict...)...))
	if err != nil {
		return nil, err
	}
	return append(append(bestEffortToEvict, burstableToEvict...), guaranteedToEvict...), nil
}

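// Worked example (illustrative, not from the original source): suppose
// requirements = [(memory, 400)], the burstable pods request 100 in total,
// and a single guaranteed pod requests 500. Evicting ALL besteffort and
// burstable pods would still leave 300 of memory unmet, so the guaranteed
// pass selects the guaranteed pod; with it selected, the burstable and
// besteffort passes see an empty requirement list and select nothing.
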
// getPodsToPreemptByDistance finds the pods that have pod requests >= admission requirements.
// Chooses pods that minimize "distance" to the requirements.
// If more than one pod exists that fulfills the remaining requirements,
// it chooses the pod that has the "smaller resource request".
// This method, by repeatedly choosing the pod that fulfills as much of the requirements as possible,
// attempts to minimize the number of pods returned.
func getPodsToPreemptByDistance(pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
	podsToEvict := []*v1.Pod{}
	// evict pods by shortest distance from remaining requirements, updating requirements every round.
	for len(requirements) > 0 {
		if len(pods) == 0 {
			return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", requirements.toString())
		}
		// all distances are at most len(requirements), because the max distance for a single requirement is 1
		bestDistance := float64(len(requirements) + 1)
		bestPodIndex := 0
		// Find the pod with the smallest distance from requirements
		// Or, in the case of two equidistant pods, find the pod with "smaller" resource requests.
		for i, pod := range pods {
			dist := requirements.distance(pod)
			if dist < bestDistance || (bestDistance == dist && smallerResourceRequest(pod, pods[bestPodIndex])) {
				bestDistance = dist
				bestPodIndex = i
			}
		}
		// subtract the pod from requirements, and transfer the pod from input-pods to pods-to-evict
		requirements = requirements.subtract(pods[bestPodIndex])
		podsToEvict = append(podsToEvict, pods[bestPodIndex])
		pods[bestPodIndex] = pods[len(pods)-1]
		pods = pods[:len(pods)-1]
	}
	return podsToEvict, nil
}

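// Worked example (illustrative, not from the original source): with
// requirements = [(memory, 100)] and candidate pods requesting 30, 80, and
// 150 memory, the distances are 0.49, 0.04, and 0 respectively, so the
// 150-request pod is chosen, the requirement list becomes empty after the
// subtract, and the loop ends after a single selection.
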
type admissionRequirement struct {
	resourceName v1.ResourceName
	quantity     int64
}

type admissionRequirementList []*admissionRequirement

// distance returns the distance of the pod's requests from the admissionRequirements.
// The distance is measured by the fraction of the requirement satisfied by the pod,
// so that each requirement is weighted equally, regardless of absolute magnitude.
func (a admissionRequirementList) distance(pod *v1.Pod) float64 {
	dist := float64(0)
	for _, req := range a {
		remainingRequest := float64(req.quantity - resource.GetResourceRequest(pod, req.resourceName))
		if remainingRequest > 0 {
			dist += math.Pow(remainingRequest/float64(req.quantity), 2)
		}
	}
	return dist
}

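// Worked example (illustrative): for requirements [(memory, 100), (cpu, 2)],
// a pod requesting 50 memory and 2 cpu leaves fractions 0.5 and 0 unmet, so
// its distance is 0.5^2 + 0 = 0.25. A pod that fully satisfies both
// requirements has distance 0, the best possible score.
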
// subtract returns a new admissionRequirementList containing the requirements
// that would remain if the provided pods were preempted.
func (a admissionRequirementList) subtract(pods ...*v1.Pod) admissionRequirementList {
	newList := []*admissionRequirement{}
	for _, req := range a {
		newQuantity := req.quantity
		for _, pod := range pods {
			newQuantity -= resource.GetResourceRequest(pod, req.resourceName)
			if newQuantity <= 0 {
				break
			}
		}
		if newQuantity > 0 {
			newList = append(newList, &admissionRequirement{
				resourceName: req.resourceName,
				quantity:     newQuantity,
			})
		}
	}
	return newList
}

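// Worked example (illustrative): subtracting a pod that requests 40 memory
// from [(memory, 100)] yields [(memory, 60)]; subtracting a pod that requests
// 120 drives the quantity to zero or below, so the requirement is dropped and
// the result is the empty list.
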
func (a admissionRequirementList) toString() string {
	s := "["
	for _, req := range a {
		s += fmt.Sprintf("(res: %v, q: %d), ", req.resourceName, req.quantity)
	}
	return s + "]"
}

// sortPodsByQOS returns lists containing besteffort, burstable, and guaranteed pods that
// can be preempted by the preemptor pod.
func sortPodsByQOS(preemptor *v1.Pod, pods []*v1.Pod) (bestEffort, burstable, guaranteed []*v1.Pod) {
	for _, pod := range pods {
		if kubetypes.Preemptable(preemptor, pod) {
			switch v1qos.GetPodQOS(pod) {
			case v1.PodQOSBestEffort:
				bestEffort = append(bestEffort, pod)
			case v1.PodQOSBurstable:
				burstable = append(burstable, pod)
			case v1.PodQOSGuaranteed:
				guaranteed = append(guaranteed, pod)
			default:
			}
		}
	}

	return
}

// smallerResourceRequest returns true if pod1 has a smaller resource request than pod2,
// comparing memory first and then CPU; if the requests are equal, it returns true.
func smallerResourceRequest(pod1 *v1.Pod, pod2 *v1.Pod) bool {
	priorityList := []v1.ResourceName{
		v1.ResourceMemory,
		v1.ResourceCPU,
	}
	for _, res := range priorityList {
		req1 := resource.GetResourceRequest(pod1, res)
		req2 := resource.GetResourceRequest(pod2, res)
		if req1 < req2 {
			return true
		} else if req1 > req2 {
			return false
		}
	}
	return true
}