
* batch/job: decouple backoff from workqueue Signed-off-by: Sathyanarayanan Saravanamuthu <sathyanarays@vmware.com> * Resolving review comments * Resolving more review comments * Resolving review comments Signed-off-by: Sathyanarayanan Saravanamuthu <sathyanarays@vmware.com> * Computing finish time to now when FinishedAt is unix epoch * Addressing review comments Signed-off-by: Sathyanarayanan Saravanamuthu <sathyanarays@vmware.com> --------- Signed-off-by: Sathyanarayanan Saravanamuthu <sathyanarays@vmware.com>
206 lines
4.9 KiB
Go
206 lines
4.9 KiB
Go
/*
|
|
Copyright 2023 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package job
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"time"
|
|
|
|
v1 "k8s.io/api/core/v1"
|
|
"k8s.io/client-go/tools/cache"
|
|
"k8s.io/utils/clock"
|
|
)
|
|
|
|
// backoffRecord is the backoff bookkeeping kept for a single key.
type backoffRecord struct {
	// key identifies the record; backoffRecordKeyFunc returns it as the
	// store key.
	key string

	// failuresAfterLastSuccess is the number of failures counted since the
	// most recent success. A value of zero means no backoff is pending.
	failuresAfterLastSuccess int32

	// lastFailureTime is the finish time of the most recent counted failure,
	// or nil when there is none.
	lastFailureTime *time.Time
}
|
|
|
|
type backoffStore struct {
|
|
store cache.Store
|
|
}
|
|
|
|
func (s *backoffStore) updateBackoffRecord(record backoffRecord) error {
|
|
b, ok, err := s.store.GetByKey(record.key)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !ok {
|
|
err = s.store.Add(&record)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
backoffRecord := b.(*backoffRecord)
|
|
backoffRecord.failuresAfterLastSuccess = record.failuresAfterLastSuccess
|
|
backoffRecord.lastFailureTime = record.lastFailureTime
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (s *backoffStore) removeBackoffRecord(jobId string) error {
|
|
b, ok, err := s.store.GetByKey(jobId)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if ok {
|
|
err = s.store.Delete(b)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
func newBackoffRecordStore() *backoffStore {
|
|
return &backoffStore{
|
|
store: cache.NewStore(backoffRecordKeyFunc),
|
|
}
|
|
}
|
|
|
|
var backoffRecordKeyFunc = func(obj interface{}) (string, error) {
|
|
if u, ok := obj.(*backoffRecord); ok {
|
|
return u.key, nil
|
|
}
|
|
return "", fmt.Errorf("could not find key for obj %#v", obj)
|
|
}
|
|
|
|
func (backoffRecordStore *backoffStore) newBackoffRecord(clock clock.WithTicker, key string, newSucceededPods []*v1.Pod, newFailedPods []*v1.Pod) backoffRecord {
|
|
now := clock.Now()
|
|
var backoff *backoffRecord
|
|
|
|
if b, exists, _ := backoffRecordStore.store.GetByKey(key); exists {
|
|
old := b.(*backoffRecord)
|
|
backoff = &backoffRecord{
|
|
key: old.key,
|
|
failuresAfterLastSuccess: old.failuresAfterLastSuccess,
|
|
lastFailureTime: old.lastFailureTime,
|
|
}
|
|
} else {
|
|
backoff = &backoffRecord{
|
|
key: key,
|
|
failuresAfterLastSuccess: 0,
|
|
lastFailureTime: nil,
|
|
}
|
|
}
|
|
|
|
sortByFinishedTime(newSucceededPods, now)
|
|
sortByFinishedTime(newFailedPods, now)
|
|
|
|
if len(newSucceededPods) == 0 {
|
|
if len(newFailedPods) == 0 {
|
|
return *backoff
|
|
}
|
|
|
|
backoff.failuresAfterLastSuccess = backoff.failuresAfterLastSuccess + int32(len(newFailedPods))
|
|
lastFailureTime := getFinishedTime(newFailedPods[len(newFailedPods)-1], now)
|
|
backoff.lastFailureTime = &lastFailureTime
|
|
return *backoff
|
|
|
|
} else {
|
|
if len(newFailedPods) == 0 {
|
|
backoff.failuresAfterLastSuccess = 0
|
|
backoff.lastFailureTime = nil
|
|
return *backoff
|
|
}
|
|
|
|
backoff.failuresAfterLastSuccess = 0
|
|
backoff.lastFailureTime = nil
|
|
|
|
lastSuccessTime := getFinishedTime(newSucceededPods[len(newSucceededPods)-1], now)
|
|
for i := len(newFailedPods) - 1; i >= 0; i-- {
|
|
failedTime := getFinishedTime(newFailedPods[i], now)
|
|
if !failedTime.After(lastSuccessTime) {
|
|
break
|
|
}
|
|
if backoff.lastFailureTime == nil {
|
|
backoff.lastFailureTime = &failedTime
|
|
}
|
|
backoff.failuresAfterLastSuccess += 1
|
|
}
|
|
|
|
return *backoff
|
|
|
|
}
|
|
|
|
}
|
|
|
|
func sortByFinishedTime(pods []*v1.Pod, currentTime time.Time) {
|
|
sort.Slice(pods, func(i, j int) bool {
|
|
p1 := pods[i]
|
|
p2 := pods[j]
|
|
p1FinishTime := getFinishedTime(p1, currentTime)
|
|
p2FinishTime := getFinishedTime(p2, currentTime)
|
|
|
|
return p1FinishTime.Before(p2FinishTime)
|
|
})
|
|
}
|
|
|
|
func getFinishedTime(p *v1.Pod, currentTime time.Time) time.Time {
|
|
var finishTime *time.Time
|
|
for _, containerState := range p.Status.ContainerStatuses {
|
|
if containerState.State.Terminated == nil {
|
|
finishTime = nil
|
|
break
|
|
}
|
|
|
|
if finishTime == nil {
|
|
finishTime = &containerState.State.Terminated.FinishedAt.Time
|
|
} else {
|
|
if finishTime.Before(containerState.State.Terminated.FinishedAt.Time) {
|
|
finishTime = &containerState.State.Terminated.FinishedAt.Time
|
|
}
|
|
}
|
|
}
|
|
|
|
if finishTime == nil || finishTime.IsZero() {
|
|
return currentTime
|
|
}
|
|
|
|
return *finishTime
|
|
}
|
|
|
|
func (backoff backoffRecord) getRemainingTime(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration) time.Duration {
|
|
if backoff.failuresAfterLastSuccess == 0 {
|
|
return 0
|
|
}
|
|
|
|
backoffDuration := defaultBackoff
|
|
for i := 1; i < int(backoff.failuresAfterLastSuccess); i++ {
|
|
backoffDuration = backoffDuration * 2
|
|
if backoffDuration >= maxBackoff {
|
|
backoffDuration = maxBackoff
|
|
break
|
|
}
|
|
}
|
|
|
|
timeElapsedSinceLastFailure := clock.Since(*backoff.lastFailureTime)
|
|
|
|
if backoffDuration < timeElapsedSinceLastFailure {
|
|
return 0
|
|
}
|
|
|
|
return backoffDuration - timeElapsedSinceLastFailure
|
|
}
|