/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package job

import (
	"fmt"
	"sort"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
	apipod "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/utils/clock"
	"k8s.io/utils/ptr"
)
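
// backoffRecord tracks, per Job key, the number of pod failures observed
// since the Job's last successful pod completion, together with the finish
// time of the most recent failure. It is the input for computing how long
// to delay the creation of replacement pods (exponential backoff).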
type backoffRecord struct {
	key                      string
	failuresAfterLastSuccess int32
	lastFailureTime          *time.Time
}
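
// backoffStore holds backoffRecords indexed by Job key (via
// backoffRecordKeyFunc).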
type backoffStore struct {
	store cache.Store
}
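
// updateBackoffRecord inserts the record if none exists for its key yet,
// and otherwise overwrites the failure count and last failure time of the
// stored record in place.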
func (s *backoffStore) updateBackoffRecord(record backoffRecord) error {
	b, ok, err := s.store.GetByKey(record.key)
	if err != nil {
		return err
	}

	if !ok {
		err = s.store.Add(&record)
		if err != nil {
			return err
		}
	} else {
		backoffRecord := b.(*backoffRecord)
		backoffRecord.failuresAfterLastSuccess = record.failuresAfterLastSuccess
		backoffRecord.lastFailureTime = record.lastFailureTime
	}

	return nil
}
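
// removeBackoffRecord deletes the backoff record for the given Job, if one
// exists; it is a no-op for unknown keys.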
func (s *backoffStore) removeBackoffRecord(jobId string) error {
	b, ok, err := s.store.GetByKey(jobId)
	if err != nil {
		return err
	}

	if ok {
		err = s.store.Delete(b)
		if err != nil {
			return err
		}
	}

	return nil
}
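
// newBackoffStore returns an empty backoffStore keyed by backoffRecord.key.
//
// A minimal usage sketch (the pod slices and durations are illustrative
// placeholders, not values taken from the Job controller):
//
//	store := newBackoffStore()
//	rec := store.newBackoffRecord("ns/job", succeededPods, failedPods)
//	delay := rec.getRemainingTime(clock.RealClock{}, 10*time.Second, 6*time.Minute)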
func newBackoffStore() *backoffStore {
	return &backoffStore{
		store: cache.NewStore(backoffRecordKeyFunc),
	}
}

var backoffRecordKeyFunc = func(obj interface{}) (string, error) {
	if u, ok := obj.(*backoffRecord); ok {
		return u.key, nil
	}
	return "", fmt.Errorf("could not find key for obj %#v", obj)
}
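
// newBackoffRecord returns an updated backoff record for the Job identified
// by key, folding in the pods that newly succeeded or failed since the
// stored record was last updated. With no new successes, every new failure
// extends the current failure streak; otherwise the streak restarts and
// counts only the failures that finished strictly after the most recent
// success.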
func (backoffRecordStore *backoffStore) newBackoffRecord(key string, newSucceededPods []*v1.Pod, newFailedPods []*v1.Pod) backoffRecord {
	var backoff *backoffRecord

	if b, exists, _ := backoffRecordStore.store.GetByKey(key); exists {
		old := b.(*backoffRecord)
		backoff = &backoffRecord{
			key:                      old.key,
			failuresAfterLastSuccess: old.failuresAfterLastSuccess,
			lastFailureTime:          old.lastFailureTime,
		}
	} else {
		backoff = &backoffRecord{
			key:                      key,
			failuresAfterLastSuccess: 0,
			lastFailureTime:          nil,
		}
	}

	sortByFinishedTime(newSucceededPods)
	sortByFinishedTime(newFailedPods)

	if len(newSucceededPods) == 0 {
		if len(newFailedPods) == 0 {
			return *backoff
		}

		// No new successes: every new failure extends the current streak.
		backoff.failuresAfterLastSuccess = backoff.failuresAfterLastSuccess + int32(len(newFailedPods))
		lastFailureTime := getFinishedTime(newFailedPods[len(newFailedPods)-1])
		backoff.lastFailureTime = &lastFailureTime
		return *backoff
	} else {
		if len(newFailedPods) == 0 {
			backoff.failuresAfterLastSuccess = 0
			backoff.lastFailureTime = nil
			return *backoff
		}

		// A new success resets the streak; re-count only the failures that
		// finished after the most recent success.
		backoff.failuresAfterLastSuccess = 0
		backoff.lastFailureTime = nil

		lastSuccessTime := getFinishedTime(newSucceededPods[len(newSucceededPods)-1])
		for i := len(newFailedPods) - 1; i >= 0; i-- {
			failedTime := getFinishedTime(newFailedPods[i])
			if !failedTime.After(lastSuccessTime) {
				break
			}
			if backoff.lastFailureTime == nil {
				backoff.lastFailureTime = &failedTime
			}
			backoff.failuresAfterLastSuccess += 1
		}

		return *backoff
	}
}

func sortByFinishedTime(pods []*v1.Pod) {
	sort.Slice(pods, func(i, j int) bool {
		p1 := pods[i]
		p2 := pods[j]
		p1FinishTime := getFinishedTime(p1)
		p2FinishTime := getFinishedTime(p2)

		return p1FinishTime.Before(p2FinishTime)
	})
}

// getFinishedTime returns the pod finish time using the following lookups:
// 1. if all containers finished, use the latest container finish time
// 2. if the pod has the Ready=False condition, use its last transition time
// 3. if the pod has been deleted, use `deletionTimestamp - grace_period` to estimate the moment of deletion
// 4. fall back to the pod's creation time
//
// Pods owned by Kubelet are marked with the Ready=False condition when
// transitioning to a terminal phase, and are thus handled by (1.) or (2.).
// Orphaned pods are deleted by PodGC, and are thus handled by (3.).
func getFinishedTime(p *v1.Pod) time.Time {
	if finishTime := getFinishTimeFromContainers(p); finishTime != nil {
		return *finishTime
	}
	if finishTime := getFinishTimeFromPodReadyFalseCondition(p); finishTime != nil {
		return *finishTime
	}
	if finishTime := getFinishTimeFromDeletionTimestamp(p); finishTime != nil {
		return *finishTime
	}
	// This should not happen in clusters with Kubelet and PodGC running.
	return p.CreationTimestamp.Time
}

func getFinishTimeFromContainers(p *v1.Pod) *time.Time {
	finishTime := latestFinishTime(nil, p.Status.ContainerStatuses, nil)
	if utilfeature.DefaultFeatureGate.Enabled(features.SidecarContainers) {
		// We need to check InitContainerStatuses here as well, because
		// sidecar (restartable init) containers always finish later than
		// regular containers.
		names := sets.New[string]()
		for _, c := range p.Spec.InitContainers {
			if c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways {
				names.Insert(c.Name)
			}
		}
		finishTime = latestFinishTime(finishTime, p.Status.InitContainerStatuses, func(status v1.ContainerStatus) bool {
			return names.Has(status.Name)
		})
	}
	return finishTime
}
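
// latestFinishTime returns the latest termination time across the given
// container statuses (restricted to those accepted by check, when non-nil),
// carrying prevFinishTime forward. It returns nil if any considered
// container has not yet terminated, since the pod's finish time is then
// not yet known.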
func latestFinishTime(prevFinishTime *time.Time, cs []v1.ContainerStatus, check func(status v1.ContainerStatus) bool) *time.Time {
	var finishTime = prevFinishTime
	for _, containerState := range cs {
		if check != nil && !check(containerState) {
			continue
		}
		if containerState.State.Terminated == nil ||
			containerState.State.Terminated.FinishedAt.Time.IsZero() {
			return nil
		}
		if finishTime == nil || finishTime.Before(containerState.State.Terminated.FinishedAt.Time) {
			finishTime = &containerState.State.Terminated.FinishedAt.Time
		}
	}
	return finishTime
}

func getFinishTimeFromPodReadyFalseCondition(p *v1.Pod) *time.Time {
	if _, c := apipod.GetPodCondition(&p.Status, v1.PodReady); c != nil && c.Status == v1.ConditionFalse && !c.LastTransitionTime.Time.IsZero() {
		return &c.LastTransitionTime.Time
	}
	return nil
}

func getFinishTimeFromDeletionTimestamp(p *v1.Pod) *time.Time {
	if p.DeletionTimestamp != nil {
		finishTime := p.DeletionTimestamp.Time.Add(-time.Duration(ptr.Deref(p.DeletionGracePeriodSeconds, 0)) * time.Second)
		return &finishTime
	}
	return nil
}
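
// getRemainingTime returns how much longer pod creation for the Job must be
// delayed, given the record's current failure streak.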
func (backoff backoffRecord) getRemainingTime(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration) time.Duration {
	return getRemainingTimeForFailuresCount(clock, defaultBackoff, maxBackoff, backoff.failuresAfterLastSuccess, backoff.lastFailureTime)
}

// getRemainingTimePerIndex returns the time remaining before a given index
// may create replacement pods. The number of consecutive pod failures for
// the index is retrieved from the `job-index-failure-count` annotation of
// the last failed pod within the index (represented by `lastFailedPod`).
// The last failed pod is also used to determine the time of the last failure.
func getRemainingTimePerIndex(logger klog.Logger, clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration, lastFailedPod *v1.Pod) time.Duration {
	if lastFailedPod == nil {
		// There is no previous failed pod for this index
		return time.Duration(0)
	}
	failureCount := getIndexAbsoluteFailureCount(logger, lastFailedPod) + 1
	lastFailureTime := getFinishedTime(lastFailedPod)
	return getRemainingTimeForFailuresCount(clock, defaultBackoff, maxBackoff, failureCount, &lastFailureTime)
}
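
// getRemainingTimeForFailuresCount computes the remaining backoff as
//
//	remaining = max(0, min(defaultBackoff * 2^(failuresCount-1), maxBackoff) - timeSinceLastFailure)
//
// For example, with defaultBackoff=10s, maxBackoff=6m and failuresCount=4,
// the full backoff is 10s * 2^3 = 80s; if the last failure was 30s ago,
// 50s remain.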
func getRemainingTimeForFailuresCount(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration, failuresCount int32, lastFailureTime *time.Time) time.Duration {
	if failuresCount == 0 {
		return 0
	}

	backoffDuration := defaultBackoff
	for i := 1; i < int(failuresCount); i++ {
		backoffDuration = backoffDuration * 2
		if backoffDuration >= maxBackoff {
			backoffDuration = maxBackoff
			break
		}
	}

	timeElapsedSinceLastFailure := clock.Since(*lastFailureTime)

	if backoffDuration < timeElapsedSinceLastFailure {
		return 0
	}

	return backoffDuration - timeElapsedSinceLastFailure
}