kubernetes/pkg/kubelet/preemption/preemption.go
Clayton Coleman 3eadd1a9ea
Keep pod worker running until pod is truly complete
A number of race conditions exist when pods are terminated early in
their lifecycle because components in the kubelet need to know "no
running containers" or "containers can't be started from now on" but
were relying on outdated state.

Only the pod worker knows whether containers are being started for
a given pod, which is required to know when a pod is "terminated"
(no running containers, none coming). Move that responsibility and
podKiller function into the pod workers, and have everything that
was killing the pod go into the UpdatePod loop. Split syncPod into
three phases - setup, terminate containers, and cleanup pod - and
have transitions between those methods be visible to other
components. After this change, to kill a pod you tell the pod worker
to UpdatePod({UpdateType: SyncPodKill, Pod: pod}).

Several places in the kubelet were incorrect about whether they
were handling terminating (should stop running, might have
containers) or terminated (no running containers) pods. The pod worker
exposes methods that allow other loops to know when to set up or tear
down resources based on the state of the pod - these methods remove
the possibility of race conditions by ensuring a single component is
responsible for knowing each pod's allowed state and other components
simply delegate to checking whether they are in the window by UID.

Removing containers no longer blocks final pod deletion in the
API server; it is now handled as background cleanup. Node shutdown
no longer marks pods as failed, since they can be restarted in the
next step.

See https://docs.google.com/document/d/1Pic5TPntdJnYfIpBeZndDelM-AbS4FN9H2GTLFhoJ04/edit# for details
2021-07-06 15:55:22 -04:00

271 lines
11 KiB
Go

/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package preemption
import (
"fmt"
"math"
v1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/api/v1/resource"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)
// message is the human-readable text attached to the preemption event and written
// into the status message of every pod evicted to make room for a critical pod.
const message = "Preempted in order to admit critical pod"
// CriticalPodAdmissionHandler is an AdmissionFailureHandler that handles admission failure for critical pods.
// If the ONLY admission failures are due to insufficient resources, then CriticalPodAdmissionHandler evicts pods
// so that the critical pod can be admitted. For evictions, the CriticalPodAdmissionHandler evicts a set of pods that
// frees up the required resource requests. The set of pods is designed to minimize impact, and is prioritized
// according to the ordering:
// minimal impact for guaranteed pods > minimal impact for burstable pods > minimal impact for besteffort pods.
// Minimal impact is defined as follows: fewest pods evicted > fewest total requests of pods.
// Finding the fewest total requests of pods is considered best effort.
type CriticalPodAdmissionHandler struct {
// getPodsFunc supplies the pods that are considered as preemption candidates.
getPodsFunc eviction.ActivePodsFunc
// killPodFunc is invoked once for every pod selected for preemption.
killPodFunc eviction.KillPodFunc
// recorder emits a warning event on each pod that is about to be preempted.
recorder record.EventRecorder
}
// Compile-time check that *CriticalPodAdmissionHandler satisfies lifecycle.AdmissionFailureHandler.
var _ lifecycle.AdmissionFailureHandler = (*CriticalPodAdmissionHandler)(nil)
// NewCriticalPodAdmissionHandler returns a CriticalPodAdmissionHandler wired with the
// given candidate-pod source, pod killer and event recorder.
func NewCriticalPodAdmissionHandler(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, recorder record.EventRecorder) *CriticalPodAdmissionHandler {
	handler := new(CriticalPodAdmissionHandler)
	handler.getPodsFunc = getPodsFunc
	handler.killPodFunc = killPodFunc
	handler.recorder = recorder
	return handler
}
// HandleAdmissionFailure reacts to an admission rejection of admitPod and, for critical
// pods that failed only because of insufficient resources, tries to make admission
// possible by preempting other pods.
//
// Return values:
//   - non-critical pod: failureReasons is returned unchanged.
//   - critical pod with at least one non-resource failure: only the non-resource
//     failures are returned, and nothing is preempted.
//   - critical pod with only resource failures: pods are preempted to free the missing
//     amounts; the returned reasons are nil and the error is whatever the eviction
//     step reported (nil means the pod is safe to admit).
func (c *CriticalPodAdmissionHandler) HandleAdmissionFailure(admitPod *v1.Pod, failureReasons []lifecycle.PredicateFailureReason) ([]lifecycle.PredicateFailureReason, error) {
	// Preemption is reserved for critical pods; everything else keeps its rejection.
	if !kubetypes.IsCriticalPod(admitPod) {
		return failureReasons, nil
	}

	// Split the failures: resource shortfalls become requirements to free up,
	// anything else remains a genuine reason to reject the pod.
	otherFailures := []lifecycle.PredicateFailureReason{}
	shortfalls := admissionRequirementList{}
	for _, failure := range failureReasons {
		insufficient, isResourceFailure := failure.(*lifecycle.InsufficientResourceError)
		if !isResourceFailure {
			otherFailures = append(otherFailures, failure)
			continue
		}
		shortfalls = append(shortfalls, &admissionRequirement{
			resourceName: insufficient.ResourceName,
			quantity:     insufficient.GetInsufficientAmount(),
		})
	}

	// Critical pods cannot fail admission for resource reasons, so only the
	// non-resource reasons are reported back.
	if len(otherFailures) != 0 {
		return otherFailures, nil
	}

	// A nil error here means preemption succeeded and the pod is safe to admit.
	return nil, c.evictPodsToFreeRequests(admitPod, shortfalls)
}
// evictPodsToFreeRequests takes a list of insufficient resources and attempts to free
// them by evicting pods based on their requests. For example, if the only insufficient
// resource is 200Mb of memory, this function could evict a pod with request=250Mb.
//
// It returns an error only when no set of pods can be found that covers
// insufficientResources; the underlying cause is wrapped so callers can inspect it with
// errors.Is / errors.As. A failure to kill an individual pod is logged and skipped, and
// does not make the function return an error.
func (c *CriticalPodAdmissionHandler) evictPodsToFreeRequests(admitPod *v1.Pod, insufficientResources admissionRequirementList) error {
	podsToPreempt, err := getPodsToPreempt(admitPod, c.getPodsFunc(), insufficientResources)
	if err != nil {
		return fmt.Errorf("preemption: error finding a set of pods to preempt: %w", err)
	}

	// The metric is labelled with the first insufficient resource (or "" when there is
	// none); the label does not depend on the pod, so compute it once.
	preemptionLabel := ""
	if len(insufficientResources) > 0 {
		preemptionLabel = insufficientResources[0].resourceName.String()
	}

	for _, pod := range podsToPreempt {
		// record that we are evicting the pod
		c.recorder.Eventf(pod, v1.EventTypeWarning, events.PreemptContainer, message)
		klog.V(3).InfoS("Preempting pod to free up resources", "pod", klog.KObj(pod), "podUID", pod.UID, "insufficientResources", insufficientResources.toString())
		// killPodFunc is expected to block and only return once the pod and its
		// containers are killed. The status callback marks the pod as failed due to
		// preemption.
		killErr := c.killPodFunc(pod, true, nil, func(status *v1.PodStatus) {
			status.Phase = v1.PodFailed
			status.Reason = events.PreemptContainer
			status.Message = message
		})
		if killErr != nil {
			klog.ErrorS(killErr, "Failed to evict pod", "pod", klog.KObj(pod))
			// In future syncPod loops, the kubelet will retry the pod deletion steps that it was stuck on.
			continue
		}
		metrics.Preemptions.WithLabelValues(preemptionLabel).Inc()
		klog.InfoS("Pod evicted successfully", "pod", klog.KObj(pod))
	}
	return nil
}
// getPodsToPreempt returns a list of pods that could be preempted to free requests >= requirements.
//
// Only pods that pod is allowed to preempt are considered. The selection works from the
// most protected QoS class downwards: guaranteed pods are picked only for what cannot be
// covered by ALL burstable and besteffort pods, burstable pods only for what is left
// after ALL besteffort pods plus the chosen guaranteed pods, and besteffort pods for the
// remainder. The result is ordered besteffort, burstable, guaranteed. An error is
// returned when the candidates together cannot cover the requirements.
func getPodsToPreempt(pod *v1.Pod, pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
	bestEffortPods, burstablePods, guaranteedPods := sortPodsByQOS(pod, pods)

	// Bail out early if evicting every candidate would still not be enough.
	lowerTiers := append(bestEffortPods, burstablePods...)
	allCandidates := append(lowerTiers, guaranteedPods...)
	if shortfall := requirements.subtract(allCandidates...); len(shortfall) > 0 {
		return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", shortfall.toString())
	}

	// Guaranteed pods: assume ALL burstable and besteffort pods are already gone.
	afterLowerTiers := requirements.subtract(lowerTiers...)
	guaranteedToEvict, err := getPodsToPreemptByDistance(guaranteedPods, afterLowerTiers)
	if err != nil {
		return nil, err
	}

	// Burstable pods: assume ALL besteffort pods and the chosen guaranteed pods are gone.
	afterBestEffortAndGuaranteed := requirements.subtract(append(bestEffortPods, guaranteedToEvict...)...)
	burstableToEvict, err := getPodsToPreemptByDistance(burstablePods, afterBestEffortAndGuaranteed)
	if err != nil {
		return nil, err
	}

	// Besteffort pods: cover whatever the chosen guaranteed and burstable pods leave open.
	afterHigherTiers := requirements.subtract(append(burstableToEvict, guaranteedToEvict...)...)
	bestEffortToEvict, err := getPodsToPreemptByDistance(bestEffortPods, afterHigherTiers)
	if err != nil {
		return nil, err
	}

	victims := append(bestEffortToEvict, burstableToEvict...)
	victims = append(victims, guaranteedToEvict...)
	return victims, nil
}
// getPodsToPreemptByDistance greedily picks pods from pods until requirements is fully
// covered, and returns the picked pods.
//
// Each round selects the pod with the smallest "distance" to the remaining requirements;
// between equidistant pods, the one smallerResourceRequest prefers wins. The selected
// pod's requests are then subtracted from the requirements. Repeatedly choosing the pod
// that fulfils as much of the requirements as possible is an attempt to keep the number
// of returned pods small.
//
// The pods slice is modified in place (selected entries are swapped out), so callers
// must not rely on its contents afterwards. An error is returned if the candidates run
// out before the requirements are met.
func getPodsToPreemptByDistance(pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
	victims := []*v1.Pod{}
	for {
		if len(requirements) == 0 {
			return victims, nil
		}
		if len(pods) == 0 {
			return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", requirements.toString())
		}

		// Each requirement contributes at most 1 to a distance, so no distance can
		// exceed len(requirements); starting one above that guarantees the first
		// candidate is always accepted.
		closest := 0
		closestDistance := float64(len(requirements) + 1)
		for idx := range pods {
			candidateDistance := requirements.distance(pods[idx])
			switch {
			case candidateDistance < closestDistance:
				// strictly closer: take it
			case closestDistance == candidateDistance && smallerResourceRequest(pods[idx], pods[closest]):
				// equidistant: take it only if it has the "smaller" resource requests
			default:
				continue
			}
			closestDistance = candidateDistance
			closest = idx
		}

		// Account for the chosen pod, then drop it from the candidates by moving the
		// last candidate into its slot.
		chosen := pods[closest]
		lastIdx := len(pods) - 1
		requirements = requirements.subtract(chosen)
		victims = append(victims, chosen)
		pods[closest] = pods[lastIdx]
		pods = pods[:lastIdx]
	}
}
// admissionRequirement describes how much of a single resource still has to be freed
// before the pod being admitted fits.
type admissionRequirement struct {
// resourceName identifies the resource that is short.
resourceName v1.ResourceName
// quantity is the amount still missing; it is compared against and reduced by the
// values returned from resource.GetResourceRequest for resourceName.
quantity int64
}
// admissionRequirementList is a set of outstanding resource requirements; its methods
// measure how well pods cover it and compute what remains after removing pods.
type admissionRequirementList []*admissionRequirement
// distance returns how far the pod's requests are from satisfying the admission
// requirements. For every requirement the pod does not fully cover, the still-unmet
// fraction of that requirement is squared and added to the result, so each requirement
// is weighted equally regardless of absolute magnitude. A pod that covers every
// requirement has distance 0.
func (a admissionRequirementList) distance(pod *v1.Pod) float64 {
	var total float64
	for _, req := range a {
		unmet := float64(req.quantity - resource.GetResourceRequest(pod, req.resourceName))
		if unmet <= 0 {
			// the pod alone covers this requirement; it adds no distance
			continue
		}
		unmetFraction := unmet / float64(req.quantity)
		total += math.Pow(unmetFraction, 2)
	}
	return total
}
// subtract returns a new admissionRequirementList holding the requirements that would
// remain if the provided pods were preempted. Requirements that the pods fully cover
// are omitted; the receiver and its entries are left untouched.
func (a admissionRequirementList) subtract(pods ...*v1.Pod) admissionRequirementList {
	remaining := admissionRequirementList{}
	for _, req := range a {
		outstanding := req.quantity
		for _, pod := range pods {
			// stop as soon as this requirement is covered
			if outstanding -= resource.GetResourceRequest(pod, req.resourceName); outstanding <= 0 {
				break
			}
		}
		if outstanding <= 0 {
			continue
		}
		remaining = append(remaining, &admissionRequirement{
			resourceName: req.resourceName,
			quantity:     outstanding,
		})
	}
	return remaining
}
// toString renders the list for log and error messages as
// "[(res: <name>, q: <quantity>), ...]"; every entry, including the last one, is
// followed by ", ".
func (a admissionRequirementList) toString() string {
	rendered := "["
	for i := range a {
		rendered = rendered + fmt.Sprintf("(res: %v, q: %d), ", a[i].resourceName, a[i].quantity)
	}
	rendered += "]"
	return rendered
}
// sortPodsByQOS partitions the pods that preemptor is allowed to preempt into
// besteffort, burstable and guaranteed lists, keeping the input order within each list.
// Pods that are not preemptable, or whose QoS class is none of the three, are left out.
func sortPodsByQOS(preemptor *v1.Pod, pods []*v1.Pod) (bestEffort, burstable, guaranteed []*v1.Pod) {
	for _, candidate := range pods {
		if !kubetypes.Preemptable(preemptor, candidate) {
			continue
		}
		switch qosClass := v1qos.GetPodQOS(candidate); qosClass {
		case v1.PodQOSGuaranteed:
			guaranteed = append(guaranteed, candidate)
		case v1.PodQOSBurstable:
			burstable = append(burstable, candidate)
		case v1.PodQOSBestEffort:
			bestEffort = append(bestEffort, candidate)
		}
	}
	return bestEffort, burstable, guaranteed
}
// smallerResourceRequest reports whether pod1 should be treated as having the smaller
// request of the two pods. Memory requests are compared first and CPU requests break a
// memory tie. When both requests are equal the function returns true.
func smallerResourceRequest(pod1 *v1.Pod, pod2 *v1.Pod) bool {
	// resources in order of comparison priority
	for _, name := range [...]v1.ResourceName{v1.ResourceMemory, v1.ResourceCPU} {
		first := resource.GetResourceRequest(pod1, name)
		second := resource.GetResourceRequest(pod2, name)
		if first != second {
			return first < second
		}
	}
	return true
}