Job controller implementation of backoff limit per index (#118009)
		| @@ -23,6 +23,7 @@ import ( | |||||||
|  |  | ||||||
| 	v1 "k8s.io/api/core/v1" | 	v1 "k8s.io/api/core/v1" | ||||||
| 	"k8s.io/client-go/tools/cache" | 	"k8s.io/client-go/tools/cache" | ||||||
|  | 	"k8s.io/klog/v2" | ||||||
| 	apipod "k8s.io/kubernetes/pkg/api/v1/pod" | 	apipod "k8s.io/kubernetes/pkg/api/v1/pod" | ||||||
| 	"k8s.io/utils/clock" | 	"k8s.io/utils/clock" | ||||||
| 	"k8s.io/utils/pointer" | 	"k8s.io/utils/pointer" | ||||||
| @@ -213,12 +214,31 @@ func getFinishTimeFromDeletionTimestamp(p *v1.Pod) *time.Time { | |||||||
| } | } | ||||||
|  |  | ||||||
| func (backoff backoffRecord) getRemainingTime(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration) time.Duration { | func (backoff backoffRecord) getRemainingTime(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration) time.Duration { | ||||||
| 	if backoff.failuresAfterLastSuccess == 0 { | 	return getRemainingTimeForFailuresCount(clock, defaultBackoff, maxBackoff, backoff.failuresAfterLastSuccess, backoff.lastFailureTime) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // getRemainingTimePerIndex returns the backoff time remaining before a | ||||||
|  | // replacement pod can be created for a given index. The number of consecutive | ||||||
|  | // pod failures for the index is retrieved from the failure-count annotations | ||||||
|  | // of the last failed pod within the index (represented by `lastFailedPod`). | ||||||
|  | // The last failed pod is also used to determine the time of the last failure. | ||||||
|  | func getRemainingTimePerIndex(logger klog.Logger, clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration, lastFailedPod *v1.Pod) time.Duration { | ||||||
|  | 	if lastFailedPod == nil { | ||||||
|  | 		// There is no previous failed pod for this index | ||||||
|  | 		return time.Duration(0) | ||||||
|  | 	} | ||||||
|  | 	failureCount := getIndexAbsoluteFailureCount(logger, lastFailedPod) + 1 | ||||||
|  | 	lastFailureTime := getFinishedTime(lastFailedPod) | ||||||
|  | 	return getRemainingTimeForFailuresCount(clock, defaultBackoff, maxBackoff, failureCount, &lastFailureTime) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func getRemainingTimeForFailuresCount(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration, failuresCount int32, lastFailureTime *time.Time) time.Duration { | ||||||
|  | 	if failuresCount == 0 { | ||||||
| 		return 0 | 		return 0 | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	backoffDuration := defaultBackoff | 	backoffDuration := defaultBackoff | ||||||
| 	for i := 1; i < int(backoff.failuresAfterLastSuccess); i++ { | 	for i := 1; i < int(failuresCount); i++ { | ||||||
| 		backoffDuration = backoffDuration * 2 | 		backoffDuration = backoffDuration * 2 | ||||||
| 		if backoffDuration >= maxBackoff { | 		if backoffDuration >= maxBackoff { | ||||||
| 			backoffDuration = maxBackoff | 			backoffDuration = maxBackoff | ||||||
| @@ -226,7 +246,7 @@ func (backoff backoffRecord) getRemainingTime(clock clock.WithTicker, defaultBac | |||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	timeElapsedSinceLastFailure := clock.Since(*backoff.lastFailureTime) | 	timeElapsedSinceLastFailure := clock.Since(*lastFailureTime) | ||||||
|  |  | ||||||
| 	if backoffDuration < timeElapsedSinceLastFailure { | 	if backoffDuration < timeElapsedSinceLastFailure { | ||||||
| 		return 0 | 		return 0 | ||||||
|   | |||||||
| @@ -23,6 +23,7 @@ import ( | |||||||
| 	"github.com/google/go-cmp/cmp" | 	"github.com/google/go-cmp/cmp" | ||||||
| 	v1 "k8s.io/api/core/v1" | 	v1 "k8s.io/api/core/v1" | ||||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||
|  | 	"k8s.io/klog/v2/ktesting" | ||||||
| 	clocktesting "k8s.io/utils/clock/testing" | 	clocktesting "k8s.io/utils/clock/testing" | ||||||
| 	"k8s.io/utils/pointer" | 	"k8s.io/utils/pointer" | ||||||
| ) | ) | ||||||
| @@ -466,3 +467,46 @@ func TestGetRemainingBackoffTime(t *testing.T) { | |||||||
| 		}) | 		}) | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func TestGetRemainingBackoffTimePerIndex(t *testing.T) { | ||||||
|  | 	defaultTestTime := metav1.NewTime(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC)) | ||||||
|  | 	testCases := map[string]struct { | ||||||
|  | 		currentTime    time.Time | ||||||
|  | 		maxBackoff     time.Duration | ||||||
|  | 		defaultBackoff time.Duration | ||||||
|  | 		lastFailedPod  *v1.Pod | ||||||
|  | 		wantDuration   time.Duration | ||||||
|  | 	}{ | ||||||
|  | 		"no failures": { | ||||||
|  | 			lastFailedPod:  nil, | ||||||
|  | 			defaultBackoff: 5 * time.Second, | ||||||
|  | 			maxBackoff:     700 * time.Second, | ||||||
|  | 			wantDuration:   0 * time.Second, | ||||||
|  | 		}, | ||||||
|  | 		"two prev failures; current time and failure time are the same": { | ||||||
|  | 			lastFailedPod:  buildPod().phase(v1.PodFailed).indexFailureCount("2").customDeletionTimestamp(defaultTestTime.Time).Pod, | ||||||
|  | 			currentTime:    defaultTestTime.Time, | ||||||
|  | 			defaultBackoff: 5 * time.Second, | ||||||
|  | 			maxBackoff:     700 * time.Second, | ||||||
|  | 			wantDuration:   20 * time.Second, | ||||||
|  | 		}, | ||||||
|  | 		"one prev failure counted and one ignored; current time and failure time are the same": { | ||||||
|  | 			lastFailedPod:  buildPod().phase(v1.PodFailed).indexFailureCount("1").indexIgnoredFailureCount("1").customDeletionTimestamp(defaultTestTime.Time).Pod, | ||||||
|  | 			currentTime:    defaultTestTime.Time, | ||||||
|  | 			defaultBackoff: 5 * time.Second, | ||||||
|  | 			maxBackoff:     700 * time.Second, | ||||||
|  | 			wantDuration:   20 * time.Second, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	for name, tc := range testCases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			logger, _ := ktesting.NewTestContext(t) | ||||||
|  | 			fakeClock := clocktesting.NewFakeClock(tc.currentTime.Truncate(time.Second)) | ||||||
|  | 			d := getRemainingTimePerIndex(logger, fakeClock, tc.defaultBackoff, tc.maxBackoff, tc.lastFailedPod) | ||||||
|  | 			if d.Seconds() != tc.wantDuration.Seconds() { | ||||||
|  | 				t.Errorf("Expected value of duration %v; got %v", tc.wantDuration, d) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|   | |||||||
| @@ -18,6 +18,7 @@ package job | |||||||
|  |  | ||||||
| import ( | import ( | ||||||
| 	"fmt" | 	"fmt" | ||||||
|  | 	"math" | ||||||
| 	"sort" | 	"sort" | ||||||
| 	"strconv" | 	"strconv" | ||||||
| 	"strings" | 	"strings" | ||||||
| @@ -41,6 +42,10 @@ func isIndexedJob(job *batch.Job) bool { | |||||||
| 	return job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion | 	return job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func hasBackoffLimitPerIndex(job *batch.Job) bool { | ||||||
|  | 	return feature.DefaultFeatureGate.Enabled(features.JobBackoffLimitPerIndex) && job.Spec.BackoffLimitPerIndex != nil | ||||||
|  | } | ||||||
|  |  | ||||||
| type interval struct { | type interval struct { | ||||||
| 	First int | 	First int | ||||||
| 	Last  int | 	Last  int | ||||||
| @@ -54,7 +59,7 @@ type orderedIntervals []interval | |||||||
| // empty list if this Job is not tracked with finalizers. The new list includes | // empty list if this Job is not tracked with finalizers. The new list includes | ||||||
| // the indexes that succeeded since the last sync. | // the indexes that succeeded since the last sync. | ||||||
| func calculateSucceededIndexes(logger klog.Logger, job *batch.Job, pods []*v1.Pod) (orderedIntervals, orderedIntervals) { | func calculateSucceededIndexes(logger klog.Logger, job *batch.Job, pods []*v1.Pod) (orderedIntervals, orderedIntervals) { | ||||||
| 	prevIntervals := succeededIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)) | 	prevIntervals := parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)) | ||||||
| 	newSucceeded := sets.New[int]() | 	newSucceeded := sets.New[int]() | ||||||
| 	for _, p := range pods { | 	for _, p := range pods { | ||||||
| 		ix := getCompletionIndex(p.Annotations) | 		ix := getCompletionIndex(p.Annotations) | ||||||
| @@ -69,9 +74,55 @@ func calculateSucceededIndexes(logger klog.Logger, job *batch.Job, pods []*v1.Po | |||||||
| 	return prevIntervals, result | 	return prevIntervals, result | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // calculateFailedIndexes returns the list of failed indexes in compressed | ||||||
|  | // format (intervals). The list includes indexes already present in | ||||||
|  | // .status.failedIndexes and indexes that failed since the last sync. | ||||||
|  | func calculateFailedIndexes(logger klog.Logger, job *batch.Job, pods []*v1.Pod) *orderedIntervals { | ||||||
|  | 	var prevIntervals orderedIntervals | ||||||
|  | 	if job.Status.FailedIndexes != nil { | ||||||
|  | 		prevIntervals = parseIndexesFromString(logger, *job.Status.FailedIndexes, int(*job.Spec.Completions)) | ||||||
|  | 	} | ||||||
|  | 	newFailed := sets.New[int]() | ||||||
|  | 	for _, p := range pods { | ||||||
|  | 		ix := getCompletionIndex(p.Annotations) | ||||||
|  | 		// Consider only failed pods with a valid index that still have the tracking finalizer (meaning they are not counted yet). | ||||||
|  | 		if ix != unknownCompletionIndex && ix < int(*job.Spec.Completions) && hasJobTrackingFinalizer(p) && isIndexFailed(logger, job, p) { | ||||||
|  | 			newFailed.Insert(ix) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	// List returns the items of the set in order. | ||||||
|  | 	result := prevIntervals.withOrderedIndexes(sets.List(newFailed)) | ||||||
|  | 	return &result | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func isIndexFailed(logger klog.Logger, job *batch.Job, pod *v1.Pod) bool { | ||||||
|  | 	isPodFailedCounted := false | ||||||
|  | 	if isPodFailed(pod, job) { | ||||||
|  | 		if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil { | ||||||
|  | 			_, countFailed, action := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod) | ||||||
|  | 			if action != nil && *action == batch.PodFailurePolicyActionFailIndex { | ||||||
|  | 				return true | ||||||
|  | 			} | ||||||
|  | 			isPodFailedCounted = countFailed | ||||||
|  | 		} else { | ||||||
|  | 			isPodFailedCounted = true | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	return isPodFailedCounted && getIndexFailureCount(logger, pod) >= *job.Spec.BackoffLimitPerIndex | ||||||
|  | } | ||||||
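In other words, isIndexFailed marks an index as failed either immediately, when a pod failure policy rule with the FailIndex action matches the failed pod, or once the counted failures recorded in the job-index-failure-count annotation reach spec.backoffLimitPerIndex. A simplified standalone model of that predicate follows; the feature-gate checks and the matchPodFailurePolicy call are reduced to booleans, and all names are illustrative.

package main

import "fmt"

// indexFailed is a simplified model of isIndexFailed: failIndexMatched stands
// for a matching PodFailurePolicyActionFailIndex rule, countFailed for whether
// the failure counts toward the per-index limit (always true when no pod
// failure policy is configured).
func indexFailed(podFailed, failIndexMatched, countFailed bool, indexFailureCount, backoffLimitPerIndex int32) bool {
	if !podFailed {
		return false
	}
	if failIndexMatched {
		return true
	}
	return countFailed && indexFailureCount >= backoffLimitPerIndex
}

func main() {
	fmt.Println(indexFailed(true, false, true, 1, 1)) // true: the per-index limit of 1 is reached
	fmt.Println(indexFailed(true, false, true, 0, 1)) // false: one retry is still left for the index
	fmt.Println(indexFailed(true, true, false, 0, 5)) // true: a FailIndex rule short-circuits the count
}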
|  |  | ||||||
| // withOrderedIndexes returns a new list of ordered intervals that contains | // withOrderedIndexes returns a new list of ordered intervals that contains | ||||||
| // the newIndexes, provided in increasing order. | // the newIndexes, provided in increasing order. | ||||||
| func (oi orderedIntervals) withOrderedIndexes(newIndexes []int) orderedIntervals { | func (oi orderedIntervals) withOrderedIndexes(newIndexes []int) orderedIntervals { | ||||||
|  | 	newIndexIntervals := make(orderedIntervals, len(newIndexes)) | ||||||
|  | 	for i, newIndex := range newIndexes { | ||||||
|  | 		newIndexIntervals[i] = interval{newIndex, newIndex} | ||||||
|  | 	} | ||||||
|  | 	return oi.merge(newIndexIntervals) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // merge returns a new list of ordered intervals containing the intervals of both oi and newOi. | ||||||
|  | func (oi orderedIntervals) merge(newOi orderedIntervals) orderedIntervals { | ||||||
| 	var result orderedIntervals | 	var result orderedIntervals | ||||||
| 	i := 0 | 	i := 0 | ||||||
| 	j := 0 | 	j := 0 | ||||||
| @@ -84,12 +135,12 @@ func (oi orderedIntervals) withOrderedIndexes(newIndexes []int) orderedIntervals | |||||||
| 			lastInterval.Last = thisInterval.Last | 			lastInterval.Last = thisInterval.Last | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 	for i < len(oi) && j < len(newIndexes) { | 	for i < len(oi) && j < len(newOi) { | ||||||
| 		if oi[i].First < newIndexes[j] { | 		if oi[i].First < newOi[j].First { | ||||||
| 			appendOrMergeWithLastInterval(oi[i]) | 			appendOrMergeWithLastInterval(oi[i]) | ||||||
| 			i++ | 			i++ | ||||||
| 		} else { | 		} else { | ||||||
| 			appendOrMergeWithLastInterval(interval{newIndexes[j], newIndexes[j]}) | 			appendOrMergeWithLastInterval(newOi[j]) | ||||||
| 			j++ | 			j++ | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| @@ -97,8 +148,8 @@ func (oi orderedIntervals) withOrderedIndexes(newIndexes []int) orderedIntervals | |||||||
| 		appendOrMergeWithLastInterval(oi[i]) | 		appendOrMergeWithLastInterval(oi[i]) | ||||||
| 		i++ | 		i++ | ||||||
| 	} | 	} | ||||||
| 	for j < len(newIndexes) { | 	for j < len(newOi) { | ||||||
| 		appendOrMergeWithLastInterval(interval{newIndexes[j], newIndexes[j]}) | 		appendOrMergeWithLastInterval(newOi[j]) | ||||||
| 		j++ | 		j++ | ||||||
| 	} | 	} | ||||||
| 	return result | 	return result | ||||||
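The generalization above turns withOrderedIndexes into a thin wrapper over merge, which walks both ordered interval lists with two cursors and coalesces ranges as it appends them. The appendOrMergeWithLastInterval helper is only partially visible in this hunk, so the exact coalescing condition in the sketch below (merge on overlap or direct adjacency) is an assumption, and the names are illustrative.

package main

import "fmt"

type interval struct{ First, Last int }

// mergeIntervals merges two ordered, non-overlapping interval lists,
// coalescing ranges that overlap or are directly adjacent.
func mergeIntervals(a, b []interval) []interval {
	var result []interval
	push := func(in interval) {
		n := len(result)
		if n == 0 || in.First > result[n-1].Last+1 {
			result = append(result, in)
		} else if in.Last > result[n-1].Last {
			result[n-1].Last = in.Last
		}
	}
	i, j := 0, 0
	for i < len(a) && j < len(b) {
		if a[i].First < b[j].First {
			push(a[i])
			i++
		} else {
			push(b[j])
			j++
		}
	}
	for ; i < len(a); i++ {
		push(a[i])
	}
	for ; j < len(b); j++ {
		push(b[j])
	}
	return result
}

func main() {
	// e.g. previously failed indexes {0, 5-7} extended with newly failed {1, 6-9}.
	fmt.Println(mergeIntervals([]interval{{0, 0}, {5, 7}}, []interval{{1, 1}, {6, 9}}))
	// Output: [{0 1} {5 9}]
}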
| @@ -150,19 +201,19 @@ func (oi orderedIntervals) has(ix int) bool { | |||||||
| 	return oi[hi].First <= ix | 	return oi[hi].First <= ix | ||||||
| } | } | ||||||
|  |  | ||||||
| func succeededIndexesFromString(logger klog.Logger, completedIndexes string, completions int) orderedIntervals { | func parseIndexesFromString(logger klog.Logger, indexesStr string, completions int) orderedIntervals { | ||||||
| 	if completedIndexes == "" { | 	if indexesStr == "" { | ||||||
| 		return nil | 		return nil | ||||||
| 	} | 	} | ||||||
| 	var result orderedIntervals | 	var result orderedIntervals | ||||||
| 	var lastInterval *interval | 	var lastInterval *interval | ||||||
| 	for _, intervalStr := range strings.Split(completedIndexes, ",") { | 	for _, intervalStr := range strings.Split(indexesStr, ",") { | ||||||
| 		limitsStr := strings.Split(intervalStr, "-") | 		limitsStr := strings.Split(intervalStr, "-") | ||||||
| 		var inter interval | 		var inter interval | ||||||
| 		var err error | 		var err error | ||||||
| 		inter.First, err = strconv.Atoi(limitsStr[0]) | 		inter.First, err = strconv.Atoi(limitsStr[0]) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			logger.Info("Corrupted completed indexes interval, ignoring", "interval", intervalStr, "err", err) | 			logger.Info("Corrupted indexes interval, ignoring", "interval", intervalStr, "err", err) | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
| 		if inter.First >= completions { | 		if inter.First >= completions { | ||||||
| @@ -171,7 +222,7 @@ func succeededIndexesFromString(logger klog.Logger, completedIndexes string, com | |||||||
| 		if len(limitsStr) > 1 { | 		if len(limitsStr) > 1 { | ||||||
| 			inter.Last, err = strconv.Atoi(limitsStr[1]) | 			inter.Last, err = strconv.Atoi(limitsStr[1]) | ||||||
| 			if err != nil { | 			if err != nil { | ||||||
| 				logger.Info("Corrupted completed indexes interval, ignoring", "interval", intervalStr, "err", err) | 				logger.Info("Corrupted indexes interval, ignoring", "interval", intervalStr, "err", err) | ||||||
| 				continue | 				continue | ||||||
| 			} | 			} | ||||||
| 			if inter.Last >= completions { | 			if inter.Last >= completions { | ||||||
| @@ -191,20 +242,17 @@ func succeededIndexesFromString(logger klog.Logger, completedIndexes string, com | |||||||
| } | } | ||||||
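parseIndexesFromString (renamed from succeededIndexesFromString so it can also parse .status.failedIndexes) reads the same compressed representation the Job status already uses for completed indexes, for example "1,3-5,7". The standalone sketch below models just that format; the corrupted-interval logging, bounds checks against completions, and interval merging shown above are omitted, and the names are illustrative.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

type interval struct{ First, Last int }

// parseIndexes expands a compressed index string such as "1,3-5,7"
// into ordered intervals, skipping entries that do not parse.
func parseIndexes(s string) []interval {
	if s == "" {
		return nil
	}
	var result []interval
	for _, part := range strings.Split(s, ",") {
		limits := strings.Split(part, "-")
		first, err := strconv.Atoi(limits[0])
		if err != nil {
			continue // the controller logs and skips corrupted intervals
		}
		last := first
		if len(limits) > 1 {
			if last, err = strconv.Atoi(limits[1]); err != nil {
				continue
			}
		}
		result = append(result, interval{first, last})
	}
	return result
}

func main() {
	fmt.Println(parseIndexes("1,3-5,7")) // [{1 1} {3 5} {7 7}]
}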
|  |  | ||||||
| // firstPendingIndexes returns `count` indexes less than `completions` that are | // firstPendingIndexes returns `count` indexes less than `completions` that are | ||||||
| // not covered by `activePods` or `succeededIndexes`. | // not covered by `activePods`, `succeededIndexes` or `failedIndexes`. | ||||||
| func firstPendingIndexes(jobCtx *syncJobCtx, count, completions int) []int { | func firstPendingIndexes(jobCtx *syncJobCtx, count, completions int) []int { | ||||||
| 	if count == 0 { | 	if count == 0 { | ||||||
| 		return nil | 		return nil | ||||||
| 	} | 	} | ||||||
| 	active := sets.New[int]() | 	active := getIndexes(jobCtx.activePods) | ||||||
| 	for _, p := range jobCtx.activePods { |  | ||||||
| 		ix := getCompletionIndex(p.Annotations) |  | ||||||
| 		if ix != unknownCompletionIndex { |  | ||||||
| 			active.Insert(ix) |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 	result := make([]int, 0, count) | 	result := make([]int, 0, count) | ||||||
| 	nonPending := jobCtx.succeededIndexes.withOrderedIndexes(sets.List(active)) | 	nonPending := jobCtx.succeededIndexes.withOrderedIndexes(sets.List(active)) | ||||||
|  | 	if jobCtx.failedIndexes != nil { | ||||||
|  | 		nonPending = nonPending.merge(*jobCtx.failedIndexes) | ||||||
|  | 	} | ||||||
| 	// The following algorithm is bounded by len(nonPending) and count. | 	// The following algorithm is bounded by len(nonPending) and count. | ||||||
| 	candidate := 0 | 	candidate := 0 | ||||||
| 	for _, sInterval := range nonPending { | 	for _, sInterval := range nonPending { | ||||||
| @@ -221,6 +269,18 @@ func firstPendingIndexes(jobCtx *syncJobCtx, count, completions int) []int { | |||||||
| 	return result | 	return result | ||||||
| } | } | ||||||
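firstPendingIndexes now also subtracts the failed indexes, so a pending index is any index below completions that is neither active, succeeded, nor failed. The loop body is only partially shown in this hunk, so the gap-walking sketch below is an assumed reconstruction of the idea, with illustrative names; its example mirrors the "with failed indexes" test case added later in this commit.

package main

import "fmt"

type interval struct{ First, Last int }

// firstPending returns up to count indexes below completions that are not
// covered by the ordered nonPending intervals (active + succeeded + failed).
func firstPending(nonPending []interval, count, completions int) []int {
	result := make([]int, 0, count)
	candidate := 0
	for _, in := range nonPending {
		for ; candidate < completions && len(result) < count && candidate < in.First; candidate++ {
			result = append(result, candidate)
		}
		if candidate < in.Last+1 {
			candidate = in.Last + 1
		}
	}
	for ; candidate < completions && len(result) < count; candidate++ {
		result = append(result, candidate)
	}
	return result
}

func main() {
	// Succeeded {1,5,9}, failed {2,6-7} and active {3,9} merge into {1-3,5-7,9}.
	fmt.Println(firstPending([]interval{{1, 3}, {5, 7}, {9, 9}}, 5, 20)) // [0 4 8 10 11]
}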
|  |  | ||||||
|  | // getIndexes returns the set of completion indexes of the given pods. | ||||||
|  | func getIndexes(pods []*v1.Pod) sets.Set[int] { | ||||||
|  | 	result := sets.New[int]() | ||||||
|  | 	for _, p := range pods { | ||||||
|  | 		ix := getCompletionIndex(p.Annotations) | ||||||
|  | 		if ix != unknownCompletionIndex { | ||||||
|  | 			result.Insert(ix) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	return result | ||||||
|  | } | ||||||
|  |  | ||||||
| // appendDuplicatedIndexPodsForRemoval scans active `pods` for duplicated | // appendDuplicatedIndexPodsForRemoval scans active `pods` for duplicated | ||||||
| // completion indexes. For each index, it selects n-1 pods for removal, where n | // completion indexes. For each index, it selects n-1 pods for removal, where n | ||||||
| // is the number of repetitions. The pods to be removed are appended to `rm`, | // is the number of repetitions. The pods to be removed are appended to `rm`, | ||||||
| @@ -248,6 +308,69 @@ func appendDuplicatedIndexPodsForRemoval(rm, left, pods []*v1.Pod, completions i | |||||||
| 	return appendPodsWithSameIndexForRemovalAndRemaining(rm, left, pods[firstRepeatPos:countLooped], lastIndex) | 	return appendPodsWithSameIndexForRemovalAndRemaining(rm, left, pods[firstRepeatPos:countLooped], lastIndex) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // getPodsWithDelayedDeletionPerIndex returns, per index, the failed pod whose | ||||||
|  | // removal is delayed while awaiting recreation. The map is used when | ||||||
|  | // BackoffLimitPerIndex is enabled to delay pod finalizer removal, and thus pod | ||||||
|  | // deletion, until the replacement pod is created, so that the replacement pod | ||||||
|  | // can have the batch.kubernetes.io/job-index-failure-count annotation set | ||||||
|  | // properly, keeping track of the number of failed pods within the index. | ||||||
|  | func getPodsWithDelayedDeletionPerIndex(logger klog.Logger, jobCtx *syncJobCtx) map[int]*v1.Pod { | ||||||
|  | 	// the failed pods corresponding to currently active indexes can be safely | ||||||
|  | 	// deleted as the failure count annotation is present in the currently | ||||||
|  | 	// active pods. | ||||||
|  | 	activeIndexes := getIndexes(jobCtx.activePods) | ||||||
|  |  | ||||||
|  | 	podsWithDelayedDeletionPerIndex := make(map[int]*v1.Pod) | ||||||
|  | 	getValidPodsWithFilter(jobCtx, nil, func(p *v1.Pod) bool { | ||||||
|  | 		if isPodFailed(p, jobCtx.job) { | ||||||
|  | 			if ix := getCompletionIndex(p.Annotations); ix != unknownCompletionIndex && ix < int(*jobCtx.job.Spec.Completions) { | ||||||
|  | 				if jobCtx.succeededIndexes.has(ix) || jobCtx.failedIndexes.has(ix) || activeIndexes.Has(ix) { | ||||||
|  | 					return false | ||||||
|  | 				} | ||||||
|  | 				if lastPodWithDelayedDeletion, ok := podsWithDelayedDeletionPerIndex[ix]; ok { | ||||||
|  | 					if getIndexAbsoluteFailureCount(logger, lastPodWithDelayedDeletion) <= getIndexAbsoluteFailureCount(logger, p) && !getFinishedTime(p).Before(getFinishedTime(lastPodWithDelayedDeletion)) { | ||||||
|  | 						podsWithDelayedDeletionPerIndex[ix] = p | ||||||
|  | 					} | ||||||
|  | 				} else { | ||||||
|  | 					podsWithDelayedDeletionPerIndex[ix] = p | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		return false | ||||||
|  | 	}) | ||||||
|  | 	return podsWithDelayedDeletionPerIndex | ||||||
|  | } | ||||||
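Within an index, when several failed pods still carry the finalizer, only the most relevant one is kept to hand its failure counts over to the replacement: the pod with the highest absolute failure count, with the latest finish time breaking ties. A hedged standalone sketch of that selection rule, using illustrative types and names rather than the controller's pod accessors:

package main

import (
	"fmt"
	"time"
)

// failedPod is a stand-in for the per-pod data being compared: the absolute
// failure count (counted + ignored annotations) and the pod's finish time.
type failedPod struct {
	name         string
	failureCount int32
	finishedAt   time.Time
}

// pickDelayedDeletionPod keeps, for one index, the failed pod whose counts
// should seed the replacement pod: highest absolute failure count, and the
// latest finish time when the counts are equal.
func pickDelayedDeletionPod(pods []failedPod) *failedPod {
	var best *failedPod
	for i := range pods {
		p := &pods[i]
		if best == nil || (best.failureCount <= p.failureCount && !p.finishedAt.Before(best.finishedAt)) {
			best = p
		}
	}
	return best
}

func main() {
	now := time.Now()
	pods := []failedPod{
		{"a1", 1, now.Add(-2 * time.Second)},
		{"a3", 1, now},
		{"a2", 1, now.Add(-time.Second)},
	}
	fmt.Println(pickDelayedDeletionPod(pods).name) // a3: equal counts, latest finish time wins
}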
|  |  | ||||||
|  | func addIndexFailureCountAnnotation(logger klog.Logger, template *v1.PodTemplateSpec, job *batch.Job, podBeingReplaced *v1.Pod) { | ||||||
|  | 	indexFailureCount, indexIgnoredFailureCount := getNewIndexFailureCounts(logger, job, podBeingReplaced) | ||||||
|  | 	template.Annotations[batch.JobIndexFailureCountAnnotation] = strconv.Itoa(int(indexFailureCount)) | ||||||
|  | 	if indexIgnoredFailureCount > 0 { | ||||||
|  | 		template.Annotations[batch.JobIndexIgnoredFailureCountAnnotation] = strconv.Itoa(int(indexIgnoredFailureCount)) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // getNewIndexFailureCounts returns the values of the job-index-failure-count | ||||||
|  | // and job-index-ignored-failure-count annotations for the new pod being created. | ||||||
|  | func getNewIndexFailureCounts(logger klog.Logger, job *batch.Job, podBeingReplaced *v1.Pod) (int32, int32) { | ||||||
|  | 	if podBeingReplaced != nil { | ||||||
|  | 		indexFailureCount := parseIndexFailureCountAnnotation(logger, podBeingReplaced) | ||||||
|  | 		indexIgnoredFailureCount := parseIndexFailureIgnoreCountAnnotation(logger, podBeingReplaced) | ||||||
|  | 		if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil { | ||||||
|  | 			_, countFailed, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, podBeingReplaced) | ||||||
|  | 			if countFailed { | ||||||
|  | 				indexFailureCount++ | ||||||
|  | 			} else { | ||||||
|  | 				indexIgnoredFailureCount++ | ||||||
|  | 			} | ||||||
|  | 		} else { | ||||||
|  | 			indexFailureCount++ | ||||||
|  | 		} | ||||||
|  | 		return indexFailureCount, indexIgnoredFailureCount | ||||||
|  | 	} | ||||||
|  | 	return 0, 0 | ||||||
|  | } | ||||||
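The replacement pod's annotations are derived from the pod being replaced: the failure increments job-index-failure-count unless a pod failure policy rule with the Ignore action matched, in which case job-index-ignored-failure-count is incremented instead. A worked sketch under those assumptions, with the policy match reduced to a single boolean and illustrative names:

package main

import "fmt"

// newIndexFailureCounts models getNewIndexFailureCounts: prevCounted and
// prevIgnored come from the replaced pod's annotations; countFailed reports
// whether the failure counts toward the per-index backoff limit (it is true
// whenever no pod failure policy is configured).
func newIndexFailureCounts(prevCounted, prevIgnored int32, countFailed bool) (counted, ignored int32) {
	if countFailed {
		return prevCounted + 1, prevIgnored
	}
	return prevCounted, prevIgnored + 1
}

func main() {
	fmt.Println(newIndexFailureCounts(3, 0, true))  // 4 0: a regular counted failure
	fmt.Println(newIndexFailureCounts(3, 0, false)) // 3 1: the failure matched an Ignore rule
}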
|  |  | ||||||
| func appendPodsWithSameIndexForRemovalAndRemaining(rm, left, pods []*v1.Pod, ix int) ([]*v1.Pod, []*v1.Pod) { | func appendPodsWithSameIndexForRemovalAndRemaining(rm, left, pods []*v1.Pod, ix int) ([]*v1.Pod, []*v1.Pod) { | ||||||
| 	if ix == unknownCompletionIndex { | 	if ix == unknownCompletionIndex { | ||||||
| 		rm = append(rm, pods...) | 		rm = append(rm, pods...) | ||||||
| @@ -281,6 +404,49 @@ func getCompletionIndex(annotations map[string]string) int { | |||||||
| 	return i | 	return i | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // getIndexFailureCount returns the value of the batch.kubernetes.io/job-index-failure-count | ||||||
|  | // annotation as int32. It falls back to 0 when: | ||||||
|  | //   - there is no annotation - for example the pod was created when the BackoffLimitPerIndex | ||||||
|  | //     feature was temporarily disabled, or the annotation was manually removed by the user, | ||||||
|  | //   - the value of the annotation isn't parsable as an int - for example because | ||||||
|  | //     it was set by a malicious user, | ||||||
|  | //   - the value of the annotation is negative or exceeds math.MaxInt32 - for | ||||||
|  | //     example because it was set by a malicious user. | ||||||
|  | func getIndexFailureCount(logger klog.Logger, pod *v1.Pod) int32 { | ||||||
|  | 	return parseIndexFailureCountAnnotation(logger, pod) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func getIndexAbsoluteFailureCount(logger klog.Logger, pod *v1.Pod) int32 { | ||||||
|  | 	return parseIndexFailureCountAnnotation(logger, pod) + parseIndexFailureIgnoreCountAnnotation(logger, pod) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func parseIndexFailureCountAnnotation(logger klog.Logger, pod *v1.Pod) int32 { | ||||||
|  | 	if value, ok := pod.Annotations[batch.JobIndexFailureCountAnnotation]; ok { | ||||||
|  | 		return parseInt32(logger, value) | ||||||
|  | 	} | ||||||
|  | 	logger.V(3).Info("There is no expected annotation", "annotationKey", batch.JobIndexFailureCountAnnotation, "pod", klog.KObj(pod), "podUID", pod.UID) | ||||||
|  | 	return 0 | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func parseIndexFailureIgnoreCountAnnotation(logger klog.Logger, pod *v1.Pod) int32 { | ||||||
|  | 	if value, ok := pod.Annotations[batch.JobIndexIgnoredFailureCountAnnotation]; ok { | ||||||
|  | 		return parseInt32(logger, value) | ||||||
|  | 	} | ||||||
|  | 	return 0 | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func parseInt32(logger klog.Logger, vStr string) int32 { | ||||||
|  | 	if vInt, err := strconv.Atoi(vStr); err != nil { | ||||||
|  | 		logger.Error(err, "Failed to parse the value", "value", vStr) | ||||||
|  | 		return 0 | ||||||
|  | 	} else if vInt < 0 || vInt > math.MaxInt32 { | ||||||
|  | 		logger.Info("The value is invalid", "value", vInt) | ||||||
|  | 		return 0 | ||||||
|  | 	} else { | ||||||
|  | 		return int32(vInt) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| func addCompletionIndexEnvVariables(template *v1.PodTemplateSpec) { | func addCompletionIndexEnvVariables(template *v1.PodTemplateSpec) { | ||||||
| 	for i := range template.Spec.InitContainers { | 	for i := range template.Spec.InitContainers { | ||||||
| 		addCompletionIndexEnvVariable(&template.Spec.InitContainers[i]) | 		addCompletionIndexEnvVariable(&template.Spec.InitContainers[i]) | ||||||
|   | |||||||
| @@ -17,12 +17,20 @@ limitations under the License. | |||||||
| package job | package job | ||||||
|  |  | ||||||
| import ( | import ( | ||||||
|  | 	"math" | ||||||
|  | 	"strconv" | ||||||
| 	"testing" | 	"testing" | ||||||
|  | 	"time" | ||||||
|  |  | ||||||
| 	"github.com/google/go-cmp/cmp" | 	"github.com/google/go-cmp/cmp" | ||||||
| 	batch "k8s.io/api/batch/v1" | 	batch "k8s.io/api/batch/v1" | ||||||
| 	v1 "k8s.io/api/core/v1" | 	v1 "k8s.io/api/core/v1" | ||||||
|  | 	"k8s.io/apimachinery/pkg/util/sets" | ||||||
|  | 	"k8s.io/apiserver/pkg/util/feature" | ||||||
|  | 	featuregatetesting "k8s.io/component-base/featuregate/testing" | ||||||
| 	"k8s.io/klog/v2/ktesting" | 	"k8s.io/klog/v2/ktesting" | ||||||
|  | 	"k8s.io/kubernetes/pkg/controller" | ||||||
|  | 	"k8s.io/kubernetes/pkg/features" | ||||||
| 	"k8s.io/utils/pointer" | 	"k8s.io/utils/pointer" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| @@ -219,6 +227,427 @@ func TestCalculateSucceededIndexes(t *testing.T) { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func TestIsIndexFailed(t *testing.T) { | ||||||
|  | 	logger, _ := ktesting.NewTestContext(t) | ||||||
|  | 	cases := map[string]struct { | ||||||
|  | 		enableJobPodFailurePolicy bool | ||||||
|  | 		job                       batch.Job | ||||||
|  | 		pod                       *v1.Pod | ||||||
|  | 		wantResult                bool | ||||||
|  | 	}{ | ||||||
|  | 		"failed pod exceeding backoffLimitPerIndex, when backoffLimitPerIndex=0": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pod:        buildPod().indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 			wantResult: true, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod exceeding backoffLimitPerIndex, when backoffLimitPerIndex=1": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pod:        buildPod().indexFailureCount("1").phase(v1.PodFailed).index("1").trackingFinalizer().Pod, | ||||||
|  | 			wantResult: true, | ||||||
|  | 		}, | ||||||
|  | 		"matching FailIndex pod failure policy; JobPodFailurePolicy enabled": { | ||||||
|  | 			enableJobPodFailurePolicy: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					PodFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 						Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 							{ | ||||||
|  | 								Action: batch.PodFailurePolicyActionFailIndex, | ||||||
|  | 								OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ | ||||||
|  | 									Operator: batch.PodFailurePolicyOnExitCodesOpIn, | ||||||
|  | 									Values:   []int32{3}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pod: buildPod().indexFailureCount("0").status(v1.PodStatus{ | ||||||
|  | 				Phase: v1.PodFailed, | ||||||
|  | 				ContainerStatuses: []v1.ContainerStatus{ | ||||||
|  | 					{ | ||||||
|  | 						State: v1.ContainerState{ | ||||||
|  | 							Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 								ExitCode: 3, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}).index("0").trackingFinalizer().Pod, | ||||||
|  | 			wantResult: true, | ||||||
|  | 		}, | ||||||
|  | 		"matching FailIndex pod failure policy; JobPodFailurePolicy disabled": { | ||||||
|  | 			enableJobPodFailurePolicy: false, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					PodFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 						Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 							{ | ||||||
|  | 								Action: batch.PodFailurePolicyActionFailIndex, | ||||||
|  | 								OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ | ||||||
|  | 									Operator: batch.PodFailurePolicyOnExitCodesOpIn, | ||||||
|  | 									Values:   []int32{3}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pod: buildPod().indexFailureCount("0").status(v1.PodStatus{ | ||||||
|  | 				Phase: v1.PodFailed, | ||||||
|  | 				ContainerStatuses: []v1.ContainerStatus{ | ||||||
|  | 					{ | ||||||
|  | 						State: v1.ContainerState{ | ||||||
|  | 							Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 								ExitCode: 3, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}).index("0").trackingFinalizer().Pod, | ||||||
|  | 			wantResult: false, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	for name, tc := range cases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)() | ||||||
|  | 			gotResult := isIndexFailed(logger, &tc.job, tc.pod) | ||||||
|  | 			if diff := cmp.Diff(tc.wantResult, gotResult); diff != "" { | ||||||
|  | 				t.Errorf("Unexpected result (-want,+got):\n%s", diff) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func TestCalculateFailedIndexes(t *testing.T) { | ||||||
|  | 	logger, _ := ktesting.NewTestContext(t) | ||||||
|  | 	cases := map[string]struct { | ||||||
|  | 		enableJobPodFailurePolicy bool | ||||||
|  | 		job                       batch.Job | ||||||
|  | 		pods                      []*v1.Pod | ||||||
|  | 		wantPrevFailedIndexes     orderedIntervals | ||||||
|  | 		wantFailedIndexes         orderedIntervals | ||||||
|  | 	}{ | ||||||
|  | 		"one new index failed": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 				buildPod().indexFailureCount("1").phase(v1.PodFailed).index("1").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantFailedIndexes: []interval{{1, 1}}, | ||||||
|  | 		}, | ||||||
|  | 		"pod without finalizer is ignored": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("0").Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantFailedIndexes: nil, | ||||||
|  | 		}, | ||||||
|  | 		"pod outside completions is ignored": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("3").Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantFailedIndexes: nil, | ||||||
|  | 		}, | ||||||
|  | 		"extend the failed indexes": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Status: batch.JobStatus{ | ||||||
|  | 					FailedIndexes: pointer.String("0"), | ||||||
|  | 				}, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("1").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantFailedIndexes: []interval{{0, 1}}, | ||||||
|  | 		}, | ||||||
|  | 		"prev failed indexes empty": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Status: batch.JobStatus{ | ||||||
|  | 					FailedIndexes: pointer.String(""), | ||||||
|  | 				}, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("1").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantFailedIndexes: []interval{{1, 1}}, | ||||||
|  | 		}, | ||||||
|  | 		"prev failed indexes outside the completions": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Status: batch.JobStatus{ | ||||||
|  | 					FailedIndexes: pointer.String("9"), | ||||||
|  | 				}, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("1").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantFailedIndexes: []interval{{1, 1}}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	for name, tc := range cases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			failedIndexes := calculateFailedIndexes(logger, &tc.job, tc.pods) | ||||||
|  | 			if diff := cmp.Diff(&tc.wantFailedIndexes, failedIndexes); diff != "" { | ||||||
|  | 				t.Errorf("Unexpected failed indexes (-want,+got):\n%s", diff) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func TestGetPodsWithDelayedDeletionPerIndex(t *testing.T) { | ||||||
|  | 	logger, _ := ktesting.NewTestContext(t) | ||||||
|  | 	now := time.Now() | ||||||
|  | 	cases := map[string]struct { | ||||||
|  | 		enableJobPodFailurePolicy           bool | ||||||
|  | 		job                                 batch.Job | ||||||
|  | 		pods                                []*v1.Pod | ||||||
|  | 		expectedRmFinalizers                sets.Set[string] | ||||||
|  | 		wantPodsWithDelayedDeletionPerIndex []string | ||||||
|  | 	}{ | ||||||
|  | 		"failed pods corresponding to non-failed indexes are kept": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(3), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 				buildPod().uid("b").indexFailureCount("1").phase(v1.PodFailed).index("1").trackingFinalizer().Pod, | ||||||
|  | 				buildPod().uid("c").indexFailureCount("0").phase(v1.PodFailed).index("2").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantPodsWithDelayedDeletionPerIndex: []string{"a", "c"}, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod without finalizer; the pod's deletion is not delayed as it already started": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("0").Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantPodsWithDelayedDeletionPerIndex: []string{}, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod with expected finalizer removal; the pod's deletion is not delayed as it already started": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			expectedRmFinalizers:                sets.New("a"), | ||||||
|  | 			wantPodsWithDelayedDeletionPerIndex: []string{}, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod with index outside of completions; the pod's deletion is not delayed": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("4").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantPodsWithDelayedDeletionPerIndex: []string{}, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod for active index; the pod's deletion is not delayed as it is already replaced": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a1").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 				buildPod().uid("a2").indexFailureCount("1").phase(v1.PodRunning).index("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantPodsWithDelayedDeletionPerIndex: []string{}, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod for succeeded index; the pod's deletion is not delayed as it is already replaced": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a1").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 				buildPod().uid("a2").indexFailureCount("1").phase(v1.PodSucceeded).index("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantPodsWithDelayedDeletionPerIndex: []string{}, | ||||||
|  | 		}, | ||||||
|  | 		"multiple failed pods for index with different failure count; only the pod with highest failure count is kept": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(4), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a1").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 				buildPod().uid("a3").indexFailureCount("2").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 				buildPod().uid("a2").indexFailureCount("1").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantPodsWithDelayedDeletionPerIndex: []string{"a3"}, | ||||||
|  | 		}, | ||||||
|  | 		"multiple failed pods for index with different finish times; only the last failed pod is kept": { | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(4), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a1").indexFailureCount("1").phase(v1.PodFailed).index("0").customDeletionTimestamp(now.Add(-time.Second)).trackingFinalizer().Pod, | ||||||
|  | 				buildPod().uid("a3").indexFailureCount("1").phase(v1.PodFailed).index("0").customDeletionTimestamp(now).trackingFinalizer().Pod, | ||||||
|  | 				buildPod().uid("a2").indexFailureCount("1").phase(v1.PodFailed).index("0").customDeletionTimestamp(now.Add(-2 * time.Second)).trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantPodsWithDelayedDeletionPerIndex: []string{"a3"}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	for name, tc := range cases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() | ||||||
|  | 			activePods := controller.FilterActivePods(logger, tc.pods) | ||||||
|  | 			failedIndexes := calculateFailedIndexes(logger, &tc.job, tc.pods) | ||||||
|  | 			_, succeededIndexes := calculateSucceededIndexes(logger, &tc.job, tc.pods) | ||||||
|  | 			jobCtx := &syncJobCtx{ | ||||||
|  | 				job:                  &tc.job, | ||||||
|  | 				pods:                 tc.pods, | ||||||
|  | 				activePods:           activePods, | ||||||
|  | 				succeededIndexes:     succeededIndexes, | ||||||
|  | 				failedIndexes:        failedIndexes, | ||||||
|  | 				expectedRmFinalizers: tc.expectedRmFinalizers, | ||||||
|  | 			} | ||||||
|  | 			gotPodsWithDelayedDeletionPerIndex := getPodsWithDelayedDeletionPerIndex(logger, jobCtx) | ||||||
|  | 			gotPodsWithDelayedDeletionPerIndexSet := sets.New[string]() | ||||||
|  | 			for _, pod := range gotPodsWithDelayedDeletionPerIndex { | ||||||
|  | 				gotPodsWithDelayedDeletionPerIndexSet.Insert(string(pod.UID)) | ||||||
|  | 			} | ||||||
|  | 			if diff := cmp.Diff(tc.wantPodsWithDelayedDeletionPerIndex, sets.List(gotPodsWithDelayedDeletionPerIndexSet)); diff != "" { | ||||||
|  | 				t.Errorf("Unexpected set of pods with delayed deletion (-want,+got):\n%s", diff) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func TestGetNewIndexFailureCounts(t *testing.T) { | ||||||
|  | 	logger, _ := ktesting.NewTestContext(t) | ||||||
|  | 	cases := map[string]struct { | ||||||
|  | 		enableJobPodFailurePolicy       bool | ||||||
|  | 		job                             batch.Job | ||||||
|  | 		pod                             *v1.Pod | ||||||
|  | 		wantNewIndexFailureCount        int32 | ||||||
|  | 		wantNewIndexIgnoredFailureCount int32 | ||||||
|  | 	}{ | ||||||
|  | 		"first pod created": { | ||||||
|  | 			job:                      batch.Job{}, | ||||||
|  | 			wantNewIndexFailureCount: 0, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod being replaced with 0 index failure count": { | ||||||
|  | 			job:                      batch.Job{}, | ||||||
|  | 			pod:                      buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 			wantNewIndexFailureCount: 1, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod being replaced with >0 index failure count": { | ||||||
|  | 			job:                      batch.Job{}, | ||||||
|  | 			pod:                      buildPod().uid("a").indexFailureCount("3").phase(v1.PodFailed).index("0").trackingFinalizer().Pod, | ||||||
|  | 			wantNewIndexFailureCount: 4, | ||||||
|  | 		}, | ||||||
|  | 		"failed pod being replaced, matching the ignore rule; JobPodFailurePolicy enabled": { | ||||||
|  | 			enableJobPodFailurePolicy: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					PodFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 						Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 							{ | ||||||
|  | 								Action: batch.PodFailurePolicyActionIgnore, | ||||||
|  | 								OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ | ||||||
|  | 									{ | ||||||
|  | 										Type:   v1.DisruptionTarget, | ||||||
|  | 										Status: v1.ConditionTrue, | ||||||
|  | 									}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pod: buildPod().uid("a").indexFailureCount("3").status(v1.PodStatus{ | ||||||
|  | 				Phase: v1.PodFailed, | ||||||
|  | 				Conditions: []v1.PodCondition{ | ||||||
|  | 					{ | ||||||
|  | 						Type:   v1.DisruptionTarget, | ||||||
|  | 						Status: v1.ConditionTrue, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}).index("3").trackingFinalizer().Pod, | ||||||
|  | 			wantNewIndexFailureCount:        3, | ||||||
|  | 			wantNewIndexIgnoredFailureCount: 1, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	for name, tc := range cases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)() | ||||||
|  | 			gotNewIndexFailureCount, gotNewIndexIgnoredFailureCount := getNewIndexFailureCounts(logger, &tc.job, tc.pod) | ||||||
|  | 			if diff := cmp.Diff(tc.wantNewIndexFailureCount, gotNewIndexFailureCount); diff != "" { | ||||||
|  | 				t.Errorf("Unexpected new index failure count (-want,+got):\n%s", diff) | ||||||
|  | 			} | ||||||
|  | 			if diff := cmp.Diff(tc.wantNewIndexIgnoredFailureCount, gotNewIndexIgnoredFailureCount); diff != "" { | ||||||
|  | 				t.Errorf("Unexpected new index ignored failure count (-want,+got):\n%s", diff) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| func TestIntervalsHaveIndex(t *testing.T) { | func TestIntervalsHaveIndex(t *testing.T) { | ||||||
| 	cases := map[string]struct { | 	cases := map[string]struct { | ||||||
| 		intervals orderedIntervals | 		intervals orderedIntervals | ||||||
| @@ -267,6 +696,7 @@ func TestFirstPendingIndexes(t *testing.T) { | |||||||
| 		completions      int | 		completions      int | ||||||
| 		activePods       []indexPhase | 		activePods       []indexPhase | ||||||
| 		succeededIndexes []interval | 		succeededIndexes []interval | ||||||
|  | 		failedIndexes    *orderedIntervals | ||||||
| 		want             []int | 		want             []int | ||||||
| 	}{ | 	}{ | ||||||
| 		"cnt greater than completions": { | 		"cnt greater than completions": { | ||||||
| @@ -310,12 +740,24 @@ func TestFirstPendingIndexes(t *testing.T) { | |||||||
| 			completions:      20, | 			completions:      20, | ||||||
| 			want:             []int{0, 1, 6, 7, 10}, | 			want:             []int{0, 1, 6, 7, 10}, | ||||||
| 		}, | 		}, | ||||||
|  | 		"with failed indexes": { | ||||||
|  | 			activePods: []indexPhase{ | ||||||
|  | 				{"3", v1.PodPending}, | ||||||
|  | 				{"9", v1.PodPending}, | ||||||
|  | 			}, | ||||||
|  | 			succeededIndexes: []interval{{1, 1}, {5, 5}, {9, 9}}, | ||||||
|  | 			failedIndexes:    &orderedIntervals{{2, 2}, {6, 7}}, | ||||||
|  | 			cnt:              5, | ||||||
|  | 			completions:      20, | ||||||
|  | 			want:             []int{0, 4, 8, 10, 11}, | ||||||
|  | 		}, | ||||||
| 	} | 	} | ||||||
| 	for name, tc := range cases { | 	for name, tc := range cases { | ||||||
| 		t.Run(name, func(t *testing.T) { | 		t.Run(name, func(t *testing.T) { | ||||||
| 			jobCtx := &syncJobCtx{ | 			jobCtx := &syncJobCtx{ | ||||||
| 				activePods:       hollowPodsWithIndexPhase(tc.activePods), | 				activePods:       hollowPodsWithIndexPhase(tc.activePods), | ||||||
| 				succeededIndexes: tc.succeededIndexes, | 				succeededIndexes: tc.succeededIndexes, | ||||||
|  | 				failedIndexes:    tc.failedIndexes, | ||||||
| 			} | 			} | ||||||
| 			got := firstPendingIndexes(jobCtx, tc.cnt, tc.completions) | 			got := firstPendingIndexes(jobCtx, tc.cnt, tc.completions) | ||||||
| 			if diff := cmp.Diff(tc.want, got); diff != "" { | 			if diff := cmp.Diff(tc.want, got); diff != "" { | ||||||
| @@ -446,6 +888,47 @@ func TestPodGenerateNameWithIndex(t *testing.T) { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func TestGetIndexFailureCount(t *testing.T) { | ||||||
|  | 	logger, _ := ktesting.NewTestContext(t) | ||||||
|  | 	cases := map[string]struct { | ||||||
|  | 		pod        *v1.Pod | ||||||
|  | 		wantResult int32 | ||||||
|  | 	}{ | ||||||
|  | 		"no annotation": { | ||||||
|  | 			pod:        &v1.Pod{}, | ||||||
|  | 			wantResult: 0, | ||||||
|  | 		}, | ||||||
|  | 		"valid value": { | ||||||
|  | 			pod:        buildPod().indexFailureCount("2").Pod, | ||||||
|  | 			wantResult: 2, | ||||||
|  | 		}, | ||||||
|  | 		"valid maxint32 value": { | ||||||
|  | 			pod:        buildPod().indexFailureCount(strconv.Itoa(math.MaxInt32)).Pod, | ||||||
|  | 			wantResult: math.MaxInt32, | ||||||
|  | 		}, | ||||||
|  | 		"too large value": { | ||||||
|  | 			pod:        buildPod().indexFailureCount(strconv.Itoa(math.MaxInt32 + 1)).Pod, | ||||||
|  | 			wantResult: 0, | ||||||
|  | 		}, | ||||||
|  | 		"negative value": { | ||||||
|  | 			pod:        buildPod().indexFailureCount("-1").Pod, | ||||||
|  | 			wantResult: 0, | ||||||
|  | 		}, | ||||||
|  | 		"invalid int value": { | ||||||
|  | 			pod:        buildPod().indexFailureCount("xyz").Pod, | ||||||
|  | 			wantResult: 0, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	for name, tc := range cases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			gotResult := getIndexFailureCount(logger, tc.pod) | ||||||
|  | 			if tc.wantResult != gotResult { | ||||||
|  | 				t.Errorf("Unexpected result. want: %d, got: %d", tc.wantResult, gotResult) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| func hollowPodsWithIndexPhase(descs []indexPhase) []*v1.Pod { | func hollowPodsWithIndexPhase(descs []indexPhase) []*v1.Pod { | ||||||
| 	pods := make([]*v1.Pod, 0, len(descs)) | 	pods := make([]*v1.Pod, 0, len(descs)) | ||||||
| 	for _, desc := range descs { | 	for _, desc := range descs { | ||||||
|   | |||||||
| @@ -132,16 +132,18 @@ type Controller struct { | |||||||
| } | } | ||||||
|  |  | ||||||
| type syncJobCtx struct { | type syncJobCtx struct { | ||||||
| 	job                  *batch.Job | 	job                             *batch.Job | ||||||
| 	pods                 []*v1.Pod | 	pods                            []*v1.Pod | ||||||
| 	finishedCondition    *batch.JobCondition | 	finishedCondition               *batch.JobCondition | ||||||
| 	activePods           []*v1.Pod | 	activePods                      []*v1.Pod | ||||||
| 	succeeded            int32 | 	succeeded                       int32 | ||||||
| 	prevSucceededIndexes orderedIntervals | 	prevSucceededIndexes            orderedIntervals | ||||||
| 	succeededIndexes     orderedIntervals | 	succeededIndexes                orderedIntervals | ||||||
| 	newBackoffRecord     backoffRecord | 	failedIndexes                   *orderedIntervals | ||||||
| 	expectedRmFinalizers sets.Set[string] | 	newBackoffRecord                backoffRecord | ||||||
| 	uncounted            *uncountedTerminatedPods | 	expectedRmFinalizers            sets.Set[string] | ||||||
|  | 	uncounted                       *uncountedTerminatedPods | ||||||
|  | 	podsWithDelayedDeletionPerIndex map[int]*v1.Pod | ||||||
| } | } | ||||||
|  |  | ||||||
| // NewController creates a new Job controller that keeps the relevant pods | // NewController creates a new Job controller that keeps the relevant pods | ||||||
| @@ -835,6 +837,17 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) { | |||||||
| 	if isIndexedJob(&job) { | 	if isIndexedJob(&job) { | ||||||
| 		jobCtx.prevSucceededIndexes, jobCtx.succeededIndexes = calculateSucceededIndexes(logger, &job, pods) | 		jobCtx.prevSucceededIndexes, jobCtx.succeededIndexes = calculateSucceededIndexes(logger, &job, pods) | ||||||
| 		jobCtx.succeeded = int32(jobCtx.succeededIndexes.total()) | 		jobCtx.succeeded = int32(jobCtx.succeededIndexes.total()) | ||||||
|  | 		if hasBackoffLimitPerIndex(&job) { | ||||||
|  | 			jobCtx.failedIndexes = calculateFailedIndexes(logger, &job, pods) | ||||||
|  | 			if jobCtx.finishedCondition == nil { | ||||||
|  | 				if job.Spec.MaxFailedIndexes != nil && jobCtx.failedIndexes.total() > int(*job.Spec.MaxFailedIndexes) { | ||||||
|  | 					jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "MaxFailedIndexesExceeded", "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now()) | ||||||
|  | 				} else if jobCtx.failedIndexes.total() > 0 && jobCtx.failedIndexes.total()+jobCtx.succeededIndexes.total() >= int(*job.Spec.Completions) { | ||||||
|  | 					jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "FailedIndexes", "Job has failed indexes", jm.clock.Now()) | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 			jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx) | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| 	suspendCondChanged := false | 	suspendCondChanged := false | ||||||
| 	// Remove active pods if Job failed. | 	// Remove active pods if Job failed. | ||||||
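The branches added above only run for Indexed Jobs that set the new spec.backoffLimitPerIndex field; spec.maxFailedIndexes additionally bounds how many indexes may fail before the whole Job gets the Failed condition. A hedged example of a Job spec exercising both fields, built with the same API packages the controller imports (the concrete values are illustrative):

package main

import (
	"fmt"

	batch "k8s.io/api/batch/v1"
	"k8s.io/utils/pointer"
)

func main() {
	// An Indexed Job where every index may be retried once, and the whole Job
	// is marked Failed as soon as more than 2 indexes have failed.
	indexed := batch.IndexedCompletion
	job := batch.Job{
		Spec: batch.JobSpec{
			CompletionMode:       &indexed,
			Completions:          pointer.Int32(10),
			Parallelism:          pointer.Int32(3),
			BackoffLimitPerIndex: pointer.Int32(1),
			MaxFailedIndexes:     pointer.Int32(2),
		},
	}
	fmt.Println(*job.Spec.BackoffLimitPerIndex, *job.Spec.MaxFailedIndexes)
}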
| @@ -1017,9 +1030,10 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job | |||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
| 		considerPodFailed := isPodFailed(pod, jobCtx.job) | 		considerPodFailed := isPodFailed(pod, jobCtx.job) | ||||||
| 		if podutil.IsPodTerminal(pod) || considerPodFailed || jobCtx.finishedCondition != nil || jobCtx.job.DeletionTimestamp != nil { | 		if !canRemoveFinalizer(logger, jobCtx, pod, considerPodFailed) { | ||||||
| 			podsToRemoveFinalizer = append(podsToRemoveFinalizer, pod) | 			continue | ||||||
| 		} | 		} | ||||||
|  | 		podsToRemoveFinalizer = append(podsToRemoveFinalizer, pod) | ||||||
| 		if pod.Status.Phase == v1.PodSucceeded && !jobCtx.uncounted.failed.Has(string(pod.UID)) { | 		if pod.Status.Phase == v1.PodSucceeded && !jobCtx.uncounted.failed.Has(string(pod.UID)) { | ||||||
| 			if isIndexed { | 			if isIndexed { | ||||||
| 				// The completion index is enough to avoid recounting succeeded pods. | 				// The completion index is enough to avoid recounting succeeded pods. | ||||||
| @@ -1073,6 +1087,14 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job | |||||||
| 		} | 		} | ||||||
| 		jobCtx.job.Status.Succeeded = int32(jobCtx.succeededIndexes.total()) | 		jobCtx.job.Status.Succeeded = int32(jobCtx.succeededIndexes.total()) | ||||||
| 		jobCtx.job.Status.CompletedIndexes = succeededIndexesStr | 		jobCtx.job.Status.CompletedIndexes = succeededIndexesStr | ||||||
|  | 		var failedIndexesStr *string | ||||||
|  | 		if jobCtx.failedIndexes != nil { | ||||||
|  | 			failedIndexesStr = pointer.String(jobCtx.failedIndexes.String()) | ||||||
|  | 		} | ||||||
|  | 		if !pointer.StringEqual(jobCtx.job.Status.FailedIndexes, failedIndexesStr) { | ||||||
|  | 			jobCtx.job.Status.FailedIndexes = failedIndexesStr | ||||||
|  | 			needsFlush = true | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| 	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) { | 	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) { | ||||||
| 		if jobCtx.finishedCondition != nil && jobCtx.finishedCondition.Type == batch.JobFailureTarget { | 		if jobCtx.finishedCondition != nil && jobCtx.finishedCondition.Type == batch.JobFailureTarget { | ||||||
| @@ -1106,6 +1128,32 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job | |||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // canRemoveFinalizer determines if the pod's finalizer can be safely removed. | ||||||
|  | // The finalizer can be removed when: | ||||||
|  | //   - the entire Job is terminating; or | ||||||
|  | //   - the Job has a finished condition; or | ||||||
|  | //   - the pod's index succeeded; or | ||||||
|  | //   - the Pod is considered failed, unless its removal is delayed for the | ||||||
|  | //     purpose of transferring the JobIndexFailureCount annotations to the | ||||||
|  | //     replacement pod for the same index. | ||||||
|  | func canRemoveFinalizer(logger klog.Logger, jobCtx *syncJobCtx, pod *v1.Pod, considerPodFailed bool) bool { | ||||||
|  | 	if jobCtx.job.DeletionTimestamp != nil || jobCtx.finishedCondition != nil || pod.Status.Phase == v1.PodSucceeded { | ||||||
|  | 		return true | ||||||
|  | 	} | ||||||
|  | 	if !considerPodFailed { | ||||||
|  | 		return false | ||||||
|  | 	} | ||||||
|  | 	if hasBackoffLimitPerIndex(jobCtx.job) { | ||||||
|  | 		if index := getCompletionIndex(pod.Annotations); index != unknownCompletionIndex { | ||||||
|  | 			if p, ok := jobCtx.podsWithDelayedDeletionPerIndex[index]; ok && p.UID == pod.UID { | ||||||
|  | 				logger.V(3).Info("Delaying pod finalizer removal to wait for pod recreation within the index", "pod", klog.KObj(pod)) | ||||||
|  | 				return false | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	return true | ||||||
|  | } | ||||||
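
For readers skimming the diff, here is a minimal, self-contained sketch of the decision order documented above. The plain booleans and the delayedDeletionUID map are illustrative stand-ins, not the controller's syncJobCtx or pod types:

package main

import "fmt"

// canRemoveFinalizerSketch mirrors the documented order: a terminating or
// finished Job, or a succeeded pod, always allows removal; a failed pod keeps
// its finalizer only while it is the delayed-deletion pod for its index.
func canRemoveFinalizerSketch(jobTerminating, jobFinished, podSucceeded, podFailed bool, index int, delayedDeletionUID map[int]string, podUID string) bool {
	if jobTerminating || jobFinished || podSucceeded {
		return true
	}
	if !podFailed {
		// Pod is still pending or running; keep the finalizer for tracking.
		return false
	}
	if uid, ok := delayedDeletionUID[index]; ok && uid == podUID {
		// Hold the finalizer until the replacement pod for this index exists.
		return false
	}
	return true
}

func main() {
	delayed := map[int]string{1: "pod-a"}
	fmt.Println(canRemoveFinalizerSketch(false, false, false, true, 1, delayed, "pod-a")) // false: removal delayed
	fmt.Println(canRemoveFinalizerSketch(false, false, false, true, 1, delayed, "pod-b")) // true: already replaced
}

Delaying removal only for the last failed pod of an index keeps a single source for the JobIndexFailureCount annotations until the replacement pod is created.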
|  |  | ||||||
| // flushUncountedAndRemoveFinalizers does: | // flushUncountedAndRemoveFinalizers does: | ||||||
| //  1. flush the Job status that might include new uncounted Pod UIDs. Also flush the interim FailureTarget condition | //  1. flush the Job status that might include new uncounted Pod UIDs. Also flush the interim FailureTarget condition | ||||||
| //     if present. | //     if present. | ||||||
| @@ -1443,7 +1491,11 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if active < wantActive { | 	if active < wantActive { | ||||||
| 		remainingTime := jobCtx.newBackoffRecord.getRemainingTime(jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff) | 		var remainingTime time.Duration | ||||||
|  | 		if !hasBackoffLimitPerIndex(job) { | ||||||
|  | 			// we compute the global remaining time for pod creation when backoffLimitPerIndex is not used | ||||||
|  | 			remainingTime = jobCtx.newBackoffRecord.getRemainingTime(jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff) | ||||||
|  | 		} | ||||||
| 		if remainingTime > 0 { | 		if remainingTime > 0 { | ||||||
| 			jm.enqueueSyncJobWithDelay(logger, job, remainingTime) | 			jm.enqueueSyncJobWithDelay(logger, job, remainingTime) | ||||||
| 			return 0, metrics.JobSyncActionPodsCreated, nil | 			return 0, metrics.JobSyncActionPodsCreated, nil | ||||||
| @@ -1456,6 +1508,13 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn | |||||||
| 		var indexesToAdd []int | 		var indexesToAdd []int | ||||||
| 		if isIndexedJob(job) { | 		if isIndexedJob(job) { | ||||||
| 			indexesToAdd = firstPendingIndexes(jobCtx, int(diff), int(*job.Spec.Completions)) | 			indexesToAdd = firstPendingIndexes(jobCtx, int(diff), int(*job.Spec.Completions)) | ||||||
|  | 			if hasBackoffLimitPerIndex(job) { | ||||||
|  | 				indexesToAdd, remainingTime = jm.getPodCreationInfoForIndependentIndexes(logger, indexesToAdd, jobCtx.podsWithDelayedDeletionPerIndex) | ||||||
|  | 				if remainingTime > 0 { | ||||||
|  | 					jm.enqueueSyncJobWithDelay(logger, job, remainingTime) | ||||||
|  | 					return 0, metrics.JobSyncActionPodsCreated, nil | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
| 			diff = int32(len(indexesToAdd)) | 			diff = int32(len(indexesToAdd)) | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| @@ -1502,6 +1561,9 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn | |||||||
| 						} | 						} | ||||||
| 						template.Spec.Hostname = fmt.Sprintf("%s-%d", job.Name, completionIndex) | 						template.Spec.Hostname = fmt.Sprintf("%s-%d", job.Name, completionIndex) | ||||||
| 						generateName = podGenerateNameWithIndex(job.Name, completionIndex) | 						generateName = podGenerateNameWithIndex(job.Name, completionIndex) | ||||||
|  | 						if hasBackoffLimitPerIndex(job) { | ||||||
|  | 							addIndexFailureCountAnnotation(logger, template, job, jobCtx.podsWithDelayedDeletionPerIndex[completionIndex]) | ||||||
|  | 						} | ||||||
| 					} | 					} | ||||||
| 					defer wait.Done() | 					defer wait.Done() | ||||||
| 					err := jm.podControl.CreatePodsWithGenerateName(ctx, job.Namespace, template, job, metav1.NewControllerRef(job, controllerKind), generateName) | 					err := jm.podControl.CreatePodsWithGenerateName(ctx, job.Namespace, template, job, metav1.NewControllerRef(job, controllerKind), generateName) | ||||||
| @@ -1544,6 +1606,26 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn | |||||||
| 	return active, metrics.JobSyncActionTracking, nil | 	return active, metrics.JobSyncActionTracking, nil | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // getPodCreationInfoForIndependentIndexes returns the subset of the given | ||||||
|  | // indexes for which replacement pods can already be created. If no index is | ||||||
|  | // ready yet, it returns the lowest remaining backoff time across all indexes, | ||||||
|  | // so the caller knows when to retry. | ||||||
|  | func (jm *Controller) getPodCreationInfoForIndependentIndexes(logger klog.Logger, indexesToAdd []int, podsWithDelayedDeletionPerIndex map[int]*v1.Pod) ([]int, time.Duration) { | ||||||
|  | 	var indexesToAddNow []int | ||||||
|  | 	var minRemainingTimePerIndex *time.Duration | ||||||
|  | 	for _, indexToAdd := range indexesToAdd { | ||||||
|  | 		if remainingTimePerIndex := getRemainingTimePerIndex(logger, jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff, podsWithDelayedDeletionPerIndex[indexToAdd]); remainingTimePerIndex == 0 { | ||||||
|  | 			indexesToAddNow = append(indexesToAddNow, indexToAdd) | ||||||
|  | 		} else if minRemainingTimePerIndex == nil || remainingTimePerIndex < *minRemainingTimePerIndex { | ||||||
|  | 			minRemainingTimePerIndex = &remainingTimePerIndex | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	if len(indexesToAddNow) > 0 { | ||||||
|  | 		return indexesToAddNow, 0 | ||||||
|  | 	} | ||||||
|  | 	return indexesToAddNow, pointer.DurationDeref(minRemainingTimePerIndex, 0) | ||||||
|  | } | ||||||
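
To make the requeue behaviour concrete, the following is a rough, self-contained illustration of the selection above, not the controller's code: indexes with zero remaining backoff are created immediately; otherwise the shortest remaining duration is what the Job is requeued with:

package main

import (
	"fmt"
	"time"
)

// readyIndexes returns the indexes whose remaining backoff is zero; if none
// are ready, it returns the shortest remaining duration to wait instead.
func readyIndexes(remaining map[int]time.Duration, indexesToAdd []int) ([]int, time.Duration) {
	var ready []int
	var minWait time.Duration
	for _, ix := range indexesToAdd {
		if remaining[ix] == 0 { // missing entries default to 0: no prior failure for the index
			ready = append(ready, ix)
		} else if minWait == 0 || remaining[ix] < minWait {
			minWait = remaining[ix]
		}
	}
	if len(ready) > 0 {
		return ready, 0
	}
	return nil, minWait
}

func main() {
	remaining := map[int]time.Duration{1: 10 * time.Second}
	fmt.Println(readyIndexes(remaining, []int{1, 3})) // [3] 0s: index 3 can be created now
	fmt.Println(readyIndexes(remaining, []int{1}))    // [] 10s: requeue the Job after 10s
}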
|  |  | ||||||
| // activePodsForRemoval returns Pods that should be removed because there | // activePodsForRemoval returns Pods that should be removed because there | ||||||
| // are too many pods running or, if this is an indexed job, there are repeated | // are too many pods running or, if this is an indexed job, there are repeated | ||||||
| // indexes or invalid indexes or some pods don't have indexes. | // indexes or invalid indexes or some pods don't have indexes. | ||||||
| @@ -1735,7 +1817,7 @@ func recordJobPodFinished(logger klog.Logger, job *batch.Job, oldCounters batch. | |||||||
| 	// now out of range (i.e. index >= spec.Completions). | 	// now out of range (i.e. index >= spec.Completions). | ||||||
| 	if isIndexedJob(job) { | 	if isIndexedJob(job) { | ||||||
| 		if job.Status.CompletedIndexes != oldCounters.CompletedIndexes { | 		if job.Status.CompletedIndexes != oldCounters.CompletedIndexes { | ||||||
| 			diff = succeededIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)).total() - succeededIndexesFromString(logger, oldCounters.CompletedIndexes, int(*job.Spec.Completions)).total() | 			diff = parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)).total() - parseIndexesFromString(logger, oldCounters.CompletedIndexes, int(*job.Spec.Completions)).total() | ||||||
| 		} | 		} | ||||||
| 	} else { | 	} else { | ||||||
| 		diff = int(job.Status.Succeeded) - int(oldCounters.Succeeded) | 		diff = int(job.Status.Succeeded) - int(oldCounters.Succeeded) | ||||||
|   | |||||||
| @@ -20,6 +20,7 @@ import ( | |||||||
| 	"context" | 	"context" | ||||||
| 	"errors" | 	"errors" | ||||||
| 	"fmt" | 	"fmt" | ||||||
|  | 	"math" | ||||||
| 	"sort" | 	"sort" | ||||||
| 	"strconv" | 	"strconv" | ||||||
| 	"testing" | 	"testing" | ||||||
| @@ -1128,6 +1129,9 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) { | |||||||
| 		wantStatusUpdates       []batch.JobStatus | 		wantStatusUpdates       []batch.JobStatus | ||||||
| 		wantSucceededPodsMetric int | 		wantSucceededPodsMetric int | ||||||
| 		wantFailedPodsMetric    int | 		wantFailedPodsMetric    int | ||||||
|  |  | ||||||
|  | 		// features | ||||||
|  | 		enableJobBackoffLimitPerIndex bool | ||||||
| 	}{ | 	}{ | ||||||
| 		"no updates": {}, | 		"no updates": {}, | ||||||
| 		"new active": { | 		"new active": { | ||||||
| @@ -1649,9 +1653,91 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) { | |||||||
| 			}, | 			}, | ||||||
| 			wantFailedPodsMetric: 2, | 			wantFailedPodsMetric: 2, | ||||||
| 		}, | 		}, | ||||||
|  | 		"indexed job with a failed pod with delayed finalizer removal; the pod is not counted": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					CompletionMode:       &indexedCompletion, | ||||||
|  | 					Completions:          pointer.Int32(6), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().index("1").Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatusUpdates: []batch.JobStatus{ | ||||||
|  | 				{ | ||||||
|  | 					UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 					FailedIndexes:           pointer.String(""), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"indexed job with a failed pod which is recreated by a running pod; the pod is counted": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					CompletionMode:       &indexedCompletion, | ||||||
|  | 					Completions:          pointer.Int32(6), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 				Status: batch.JobStatus{ | ||||||
|  | 					Active: 1, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a1").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().index("1").Pod, | ||||||
|  | 				buildPod().uid("a2").phase(v1.PodRunning).indexFailureCount("1").trackingFinalizer().index("1").Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantRmFinalizers: 1, | ||||||
|  | 			wantStatusUpdates: []batch.JobStatus{ | ||||||
|  | 				{ | ||||||
|  | 					Active: 1, | ||||||
|  | 					UncountedTerminatedPods: &batch.UncountedTerminatedPods{ | ||||||
|  | 						Failed: []types.UID{"a1"}, | ||||||
|  | 					}, | ||||||
|  | 					FailedIndexes: pointer.String(""), | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					Active:                  1, | ||||||
|  | 					Failed:                  1, | ||||||
|  | 					UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 					FailedIndexes:           pointer.String(""), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantFailedPodsMetric: 1, | ||||||
|  | 		}, | ||||||
|  | 		"indexed job with a failed pod for a failed index; the pod is counted": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					CompletionMode:       &indexedCompletion, | ||||||
|  | 					Completions:          pointer.Int32(6), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []*v1.Pod{ | ||||||
|  | 				buildPod().uid("a").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().index("1").Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantRmFinalizers: 1, | ||||||
|  | 			wantStatusUpdates: []batch.JobStatus{ | ||||||
|  | 				{ | ||||||
|  | 					FailedIndexes: pointer.String("1"), | ||||||
|  | 					UncountedTerminatedPods: &batch.UncountedTerminatedPods{ | ||||||
|  | 						Failed: []types.UID{"a"}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					Failed:                  1, | ||||||
|  | 					FailedIndexes:           pointer.String("1"), | ||||||
|  | 					UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantFailedPodsMetric: 1, | ||||||
|  | 		}, | ||||||
| 	} | 	} | ||||||
| 	for name, tc := range cases { | 	for name, tc := range cases { | ||||||
| 		t.Run(name, func(t *testing.T) { | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)() | ||||||
| 			clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) | 			clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) | ||||||
| 			manager, _ := newControllerFromClient(ctx, clientSet, controller.NoResyncPeriodFunc) | 			manager, _ := newControllerFromClient(ctx, clientSet, controller.NoResyncPeriodFunc) | ||||||
| 			fakePodControl := controller.FakePodControl{Err: tc.podControlErr} | 			fakePodControl := controller.FakePodControl{Err: tc.podControlErr} | ||||||
| @@ -1666,20 +1752,22 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) { | |||||||
| 			if job.Status.UncountedTerminatedPods == nil { | 			if job.Status.UncountedTerminatedPods == nil { | ||||||
| 				job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{} | 				job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{} | ||||||
| 			} | 			} | ||||||
| 			uncounted := newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods) |  | ||||||
| 			var succeededIndexes orderedIntervals |  | ||||||
| 			if isIndexedJob(job) { |  | ||||||
| 				succeededIndexes = succeededIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)) |  | ||||||
| 			} |  | ||||||
| 			jobCtx := &syncJobCtx{ | 			jobCtx := &syncJobCtx{ | ||||||
| 				job:                  job, | 				job:                  job, | ||||||
| 				pods:                 tc.pods, | 				pods:                 tc.pods, | ||||||
| 				succeededIndexes:     succeededIndexes, | 				uncounted:            newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods), | ||||||
| 				uncounted:            uncounted, |  | ||||||
| 				expectedRmFinalizers: tc.expectedRmFinalizers, | 				expectedRmFinalizers: tc.expectedRmFinalizers, | ||||||
| 				finishedCondition:    tc.finishedCond, | 				finishedCondition:    tc.finishedCond, | ||||||
| 				newBackoffRecord:     backoffRecord{}, |  | ||||||
| 			} | 			} | ||||||
|  | 			if isIndexedJob(job) { | ||||||
|  | 				jobCtx.succeededIndexes = parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)) | ||||||
|  | 				if tc.enableJobBackoffLimitPerIndex && job.Spec.BackoffLimitPerIndex != nil { | ||||||
|  | 					jobCtx.failedIndexes = calculateFailedIndexes(logger, job, tc.pods) | ||||||
|  | 					jobCtx.activePods = controller.FilterActivePods(logger, tc.pods) | ||||||
|  | 					jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx) | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  |  | ||||||
| 			err := manager.trackJobStatusAndRemoveFinalizers(ctx, jobCtx, tc.needsFlush) | 			err := manager.trackJobStatusAndRemoveFinalizers(ctx, jobCtx, tc.needsFlush) | ||||||
| 			if !errors.Is(err, tc.wantErr) { | 			if !errors.Is(err, tc.wantErr) { | ||||||
| 				t.Errorf("Got error %v, want %v", err, tc.wantErr) | 				t.Errorf("Got error %v, want %v", err, tc.wantErr) | ||||||
| @@ -3123,6 +3211,484 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) { | ||||||
|  | 	_, ctx := ktesting.NewTestContext(t) | ||||||
|  | 	now := time.Now() | ||||||
|  | 	validObjectMeta := metav1.ObjectMeta{ | ||||||
|  | 		Name:      "foobar", | ||||||
|  | 		UID:       uuid.NewUUID(), | ||||||
|  | 		Namespace: metav1.NamespaceDefault, | ||||||
|  | 	} | ||||||
|  | 	validSelector := &metav1.LabelSelector{ | ||||||
|  | 		MatchLabels: map[string]string{"foo": "bar"}, | ||||||
|  | 	} | ||||||
|  | 	validTemplate := v1.PodTemplateSpec{ | ||||||
|  | 		ObjectMeta: metav1.ObjectMeta{ | ||||||
|  | 			Labels: map[string]string{ | ||||||
|  | 				"foo": "bar", | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		Spec: v1.PodSpec{ | ||||||
|  | 			Containers: []v1.Container{ | ||||||
|  | 				{Image: "foo/bar"}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	testCases := map[string]struct { | ||||||
|  | 		enableJobBackoffLimitPerIndex bool | ||||||
|  | 		enableJobPodFailurePolicy     bool | ||||||
|  | 		job                           batch.Job | ||||||
|  | 		pods                          []v1.Pod | ||||||
|  | 		wantStatus                    batch.JobStatus | ||||||
|  | 	}{ | ||||||
|  | 		"successful job after a single failure within index": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(math.MaxInt32), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a1").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 				*buildPod().uid("a2").index("0").phase(v1.PodSucceeded).indexFailureCount("1").trackingFinalizer().Pod, | ||||||
|  | 				*buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Failed:                  1, | ||||||
|  | 				Succeeded:               2, | ||||||
|  | 				CompletedIndexes:        "0,1", | ||||||
|  | 				FailedIndexes:           pointer.String(""), | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 				Conditions: []batch.JobCondition{ | ||||||
|  | 					{ | ||||||
|  | 						Type:   batch.JobComplete, | ||||||
|  | 						Status: v1.ConditionTrue, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"single failed pod, not counted as the replacement pod creation is delayed": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(math.MaxInt32), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Active:                  2, | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 				FailedIndexes:           pointer.String(""), | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"single failed pod replaced already": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(math.MaxInt32), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 				*buildPod().uid("b").index("0").phase(v1.PodPending).indexFailureCount("1").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Active:                  2, | ||||||
|  | 				Failed:                  1, | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 				FailedIndexes:           pointer.String(""), | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"single failed index due to exceeding the backoff limit per index, the job continues": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(math.MaxInt32), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Active:                  1, | ||||||
|  | 				Failed:                  1, | ||||||
|  | 				FailedIndexes:           pointer.String("0"), | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"single failed index due to FailIndex action, the job continues": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			enableJobPodFailurePolicy:     true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(math.MaxInt32), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					PodFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 						Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 							{ | ||||||
|  | 								Action: batch.PodFailurePolicyActionFailIndex, | ||||||
|  | 								OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ | ||||||
|  | 									Operator: batch.PodFailurePolicyOnExitCodesOpIn, | ||||||
|  | 									Values:   []int32{3}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").status(v1.PodStatus{ | ||||||
|  | 					Phase: v1.PodFailed, | ||||||
|  | 					ContainerStatuses: []v1.ContainerStatus{ | ||||||
|  | 						{ | ||||||
|  | 							State: v1.ContainerState{ | ||||||
|  | 								Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 									ExitCode: 3, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Active:                  1, | ||||||
|  | 				Failed:                  1, | ||||||
|  | 				FailedIndexes:           pointer.String("0"), | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"job failed index due to FailJob action": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			enableJobPodFailurePolicy:     true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(6), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					PodFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 						Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 							{ | ||||||
|  | 								Action: batch.PodFailurePolicyActionFailJob, | ||||||
|  | 								OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ | ||||||
|  | 									Operator: batch.PodFailurePolicyOnExitCodesOpIn, | ||||||
|  | 									Values:   []int32{3}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").status(v1.PodStatus{ | ||||||
|  | 					Phase: v1.PodFailed, | ||||||
|  | 					ContainerStatuses: []v1.ContainerStatus{ | ||||||
|  | 						{ | ||||||
|  | 							Name: "x", | ||||||
|  | 							State: v1.ContainerState{ | ||||||
|  | 								Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 									ExitCode: 3, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Active:                  0, | ||||||
|  | 				Failed:                  1, | ||||||
|  | 				FailedIndexes:           pointer.String(""), | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 				Conditions: []batch.JobCondition{ | ||||||
|  | 					{ | ||||||
|  | 						Type:    batch.JobFailureTarget, | ||||||
|  | 						Status:  v1.ConditionTrue, | ||||||
|  | 						Reason:  "PodFailurePolicy", | ||||||
|  | 						Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0", | ||||||
|  | 					}, | ||||||
|  | 					{ | ||||||
|  | 						Type:    batch.JobFailed, | ||||||
|  | 						Status:  v1.ConditionTrue, | ||||||
|  | 						Reason:  "PodFailurePolicy", | ||||||
|  | 						Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0", | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"job pod failure ignored due to matching Ignore action": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			enableJobPodFailurePolicy:     true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(6), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					PodFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 						Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 							{ | ||||||
|  | 								Action: batch.PodFailurePolicyActionIgnore, | ||||||
|  | 								OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ | ||||||
|  | 									Operator: batch.PodFailurePolicyOnExitCodesOpIn, | ||||||
|  | 									Values:   []int32{3}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").status(v1.PodStatus{ | ||||||
|  | 					Phase: v1.PodFailed, | ||||||
|  | 					ContainerStatuses: []v1.ContainerStatus{ | ||||||
|  | 						{ | ||||||
|  | 							Name: "x", | ||||||
|  | 							State: v1.ContainerState{ | ||||||
|  | 								Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 									ExitCode: 3, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Active:                  2, | ||||||
|  | 				Failed:                  0, | ||||||
|  | 				FailedIndexes:           pointer.String(""), | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"job failed due to exceeding backoffLimit before backoffLimitPerIndex": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(1), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 				*buildPod().uid("b").index("1").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Failed:                  2, | ||||||
|  | 				Succeeded:               0, | ||||||
|  | 				FailedIndexes:           pointer.String(""), | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 				Conditions: []batch.JobCondition{ | ||||||
|  | 					{ | ||||||
|  | 						Type:    batch.JobFailed, | ||||||
|  | 						Status:  v1.ConditionTrue, | ||||||
|  | 						Reason:  "BackoffLimitExceeded", | ||||||
|  | 						Message: "Job has reached the specified backoff limit", | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"job failed due to failed indexes": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					BackoffLimit:         pointer.Int32(math.MaxInt32), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, | ||||||
|  | 				*buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Failed:                  1, | ||||||
|  | 				Succeeded:               1, | ||||||
|  | 				FailedIndexes:           pointer.String("0"), | ||||||
|  | 				CompletedIndexes:        "1", | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 				Conditions: []batch.JobCondition{ | ||||||
|  | 					{ | ||||||
|  | 						Type:    batch.JobFailed, | ||||||
|  | 						Status:  v1.ConditionTrue, | ||||||
|  | 						Reason:  "FailedIndexes", | ||||||
|  | 						Message: "Job has failed indexes", | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"job failed due to exceeding max failed indexes": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(4), | ||||||
|  | 					Completions:          pointer.Int32(4), | ||||||
|  | 					BackoffLimit:         pointer.Int32(math.MaxInt32), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					MaxFailedIndexes:     pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, | ||||||
|  | 				*buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 				*buildPod().uid("c").index("2").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, | ||||||
|  | 				*buildPod().uid("d").index("3").phase(v1.PodRunning).indexFailureCount("0").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Failed:                  3, | ||||||
|  | 				Succeeded:               1, | ||||||
|  | 				FailedIndexes:           pointer.String("0,2"), | ||||||
|  | 				CompletedIndexes:        "1", | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 				Conditions: []batch.JobCondition{ | ||||||
|  | 					{ | ||||||
|  | 						Type:    batch.JobFailed, | ||||||
|  | 						Status:  v1.ConditionTrue, | ||||||
|  | 						Reason:  "MaxFailedIndexesExceeded", | ||||||
|  | 						Message: "Job has exceeded the specified maximal number of failed indexes", | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 		"job with finished indexes; failedIndexes are cleaned when JobBackoffLimitPerIndex disabled": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: false, | ||||||
|  | 			job: batch.Job{ | ||||||
|  | 				TypeMeta:   metav1.TypeMeta{Kind: "Job"}, | ||||||
|  | 				ObjectMeta: validObjectMeta, | ||||||
|  | 				Spec: batch.JobSpec{ | ||||||
|  | 					Selector:             validSelector, | ||||||
|  | 					Template:             validTemplate, | ||||||
|  | 					Parallelism:          pointer.Int32(3), | ||||||
|  | 					Completions:          pointer.Int32(3), | ||||||
|  | 					BackoffLimit:         pointer.Int32(math.MaxInt32), | ||||||
|  | 					CompletionMode:       completionModePtr(batch.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 				}, | ||||||
|  | 				Status: batch.JobStatus{ | ||||||
|  | 					FailedIndexes:    pointer.String("0"), | ||||||
|  | 					CompletedIndexes: "1", | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			pods: []v1.Pod{ | ||||||
|  | 				*buildPod().uid("c").index("2").phase(v1.PodPending).indexFailureCount("1").trackingFinalizer().Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantStatus: batch.JobStatus{ | ||||||
|  | 				Active:                  2, | ||||||
|  | 				Succeeded:               1, | ||||||
|  | 				CompletedIndexes:        "1", | ||||||
|  | 				UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	for name, tc := range testCases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)() | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)() | ||||||
|  | 			clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) | ||||||
|  | 			fakeClock := clocktesting.NewFakeClock(now) | ||||||
|  | 			manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, clientset, controller.NoResyncPeriodFunc, fakeClock) | ||||||
|  | 			fakePodControl := controller.FakePodControl{} | ||||||
|  | 			manager.podControl = &fakePodControl | ||||||
|  | 			manager.podStoreSynced = alwaysReady | ||||||
|  | 			manager.jobStoreSynced = alwaysReady | ||||||
|  | 			job := &tc.job | ||||||
|  |  | ||||||
|  | 			actual := job | ||||||
|  | 			manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { | ||||||
|  | 				actual = job | ||||||
|  | 				return job, nil | ||||||
|  | 			} | ||||||
|  | 			sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) | ||||||
|  | 			for i, pod := range tc.pods { | ||||||
|  | 				pod := pod | ||||||
|  | 				pb := podBuilder{Pod: &pod}.name(fmt.Sprintf("mypod-%d", i)).job(job) | ||||||
|  | 				if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion { | ||||||
|  | 					pb.index(fmt.Sprintf("%v", getCompletionIndex(pod.Annotations))) | ||||||
|  | 				} | ||||||
|  | 				pb = pb.trackingFinalizer() | ||||||
|  | 				sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod) | ||||||
|  | 			} | ||||||
|  |  | ||||||
|  | 			manager.syncJob(context.TODO(), testutil.GetKey(job, t)) | ||||||
|  |  | ||||||
|  | 			// validate relevant fields of the status | ||||||
|  | 			if diff := cmp.Diff(tc.wantStatus, actual.Status, | ||||||
|  | 				cmpopts.IgnoreFields(batch.JobStatus{}, "StartTime", "CompletionTime", "Ready"), | ||||||
|  | 				cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { | ||||||
|  | 				t.Errorf("unexpected job status. Diff: %s\n", diff) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| func TestSyncJobUpdateRequeue(t *testing.T) { | func TestSyncJobUpdateRequeue(t *testing.T) { | ||||||
| 	_, ctx := ktesting.NewTestContext(t) | 	_, ctx := ktesting.NewTestContext(t) | ||||||
| 	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) | 	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) | ||||||
| @@ -3217,6 +3783,69 @@ func TestUpdateJobRequeue(t *testing.T) { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func TestGetPodCreationInfoForIndependentIndexes(t *testing.T) { | ||||||
|  | 	logger, ctx := ktesting.NewTestContext(t) | ||||||
|  | 	now := time.Now() | ||||||
|  | 	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) | ||||||
|  | 	cases := map[string]struct { | ||||||
|  | 		indexesToAdd                    []int | ||||||
|  | 		podsWithDelayedDeletionPerIndex map[int]*v1.Pod | ||||||
|  | 		wantIndexesToAdd                []int | ||||||
|  | 		wantRemainingTime               time.Duration | ||||||
|  | 	}{ | ||||||
|  | 		"simple index creation": { | ||||||
|  | 			indexesToAdd:     []int{1, 3}, | ||||||
|  | 			wantIndexesToAdd: []int{1, 3}, | ||||||
|  | 		}, | ||||||
|  | 		"subset of indexes can be recreated now": { | ||||||
|  | 			indexesToAdd: []int{1, 3}, | ||||||
|  | 			podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ | ||||||
|  | 				1: buildPod().indexFailureCount("0").index("1").customDeletionTimestamp(now).Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantIndexesToAdd: []int{3}, | ||||||
|  | 		}, | ||||||
|  | 		"subset of indexes can be recreated now as the pods failed a long time ago": { | ||||||
|  | 			indexesToAdd: []int{1, 3}, | ||||||
|  | 			podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ | ||||||
|  | 				1: buildPod().indexFailureCount("0").customDeletionTimestamp(now).Pod, | ||||||
|  | 				3: buildPod().indexFailureCount("0").customDeletionTimestamp(now.Add(-DefaultJobPodFailureBackOff)).Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantIndexesToAdd: []int{3}, | ||||||
|  | 		}, | ||||||
|  | 		"no indexes can be recreated now, need to wait default pod failure backoff": { | ||||||
|  | 			indexesToAdd: []int{1, 2, 3}, | ||||||
|  | 			podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ | ||||||
|  | 				1: buildPod().indexFailureCount("1").customDeletionTimestamp(now).Pod, | ||||||
|  | 				2: buildPod().indexFailureCount("0").customDeletionTimestamp(now).Pod, | ||||||
|  | 				3: buildPod().indexFailureCount("2").customDeletionTimestamp(now).Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantRemainingTime: DefaultJobPodFailureBackOff, | ||||||
|  | 		}, | ||||||
|  | 		"no indexes can be recreated now, need to wait but 1s already passed": { | ||||||
|  | 			indexesToAdd: []int{1, 2, 3}, | ||||||
|  | 			podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ | ||||||
|  | 				1: buildPod().indexFailureCount("1").customDeletionTimestamp(now.Add(-time.Second)).Pod, | ||||||
|  | 				2: buildPod().indexFailureCount("0").customDeletionTimestamp(now.Add(-time.Second)).Pod, | ||||||
|  | 				3: buildPod().indexFailureCount("2").customDeletionTimestamp(now.Add(-time.Second)).Pod, | ||||||
|  | 			}, | ||||||
|  | 			wantRemainingTime: DefaultJobPodFailureBackOff - time.Second, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	for name, tc := range cases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			fakeClock := clocktesting.NewFakeClock(now) | ||||||
|  | 			manager, _ := newControllerFromClientWithClock(ctx, clientset, controller.NoResyncPeriodFunc, fakeClock) | ||||||
|  | 			gotIndexesToAdd, gotRemainingTime := manager.getPodCreationInfoForIndependentIndexes(logger, tc.indexesToAdd, tc.podsWithDelayedDeletionPerIndex) | ||||||
|  | 			if diff := cmp.Diff(tc.wantIndexesToAdd, gotIndexesToAdd); diff != "" { | ||||||
|  | 				t.Fatalf("Unexpected indexes to add: %s", diff) | ||||||
|  | 			} | ||||||
|  | 			if diff := cmp.Diff(tc.wantRemainingTime, gotRemainingTime); diff != "" { | ||||||
|  | 				t.Fatalf("Unexpected remaining time: %s", diff) | ||||||
|  | 			} | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| func TestJobPodLookup(t *testing.T) { | func TestJobPodLookup(t *testing.T) { | ||||||
| 	_, ctx := ktesting.NewTestContext(t) | 	_, ctx := ktesting.NewTestContext(t) | ||||||
| 	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) | 	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) | ||||||
| @@ -4541,10 +5170,27 @@ func (pb podBuilder) clearLabels() podBuilder { | |||||||
| } | } | ||||||
|  |  | ||||||
| func (pb podBuilder) index(ix string) podBuilder { | func (pb podBuilder) index(ix string) podBuilder { | ||||||
|  | 	return pb.annotation(batch.JobCompletionIndexAnnotation, ix) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func (pb podBuilder) indexFailureCount(count string) podBuilder { | ||||||
|  | 	return pb.annotation(batch.JobIndexFailureCountAnnotation, count) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func (pb podBuilder) indexIgnoredFailureCount(count string) podBuilder { | ||||||
|  | 	return pb.annotation(batch.JobIndexIgnoredFailureCountAnnotation, count) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func (pb podBuilder) annotation(key, value string) podBuilder { | ||||||
| 	if pb.Annotations == nil { | 	if pb.Annotations == nil { | ||||||
| 		pb.Annotations = make(map[string]string) | 		pb.Annotations = make(map[string]string) | ||||||
| 	} | 	} | ||||||
| 	pb.Annotations[batch.JobCompletionIndexAnnotation] = ix | 	pb.Annotations[key] = value | ||||||
|  | 	return pb | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func (pb podBuilder) status(s v1.PodStatus) podBuilder { | ||||||
|  | 	pb.Status = s | ||||||
| 	return pb | 	return pb | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -4568,6 +5214,15 @@ func (pb podBuilder) deletionTimestamp() podBuilder { | |||||||
| 	return pb | 	return pb | ||||||
| } | } | ||||||
|  |  | ||||||
|  | func (pb podBuilder) customDeletionTimestamp(t time.Time) podBuilder { | ||||||
|  | 	pb.DeletionTimestamp = &metav1.Time{Time: t} | ||||||
|  | 	return pb | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func completionModePtr(m batch.CompletionMode) *batch.CompletionMode { | ||||||
|  | 	return &m | ||||||
|  | } | ||||||
|  |  | ||||||
| func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() { | func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() { | ||||||
| 	origVal := *val | 	origVal := *val | ||||||
| 	*val = newVal | 	*val = newVal | ||||||
|   | |||||||
| @@ -21,20 +21,24 @@ import ( | |||||||
|  |  | ||||||
| 	batch "k8s.io/api/batch/v1" | 	batch "k8s.io/api/batch/v1" | ||||||
| 	v1 "k8s.io/api/core/v1" | 	v1 "k8s.io/api/core/v1" | ||||||
|  | 	"k8s.io/apiserver/pkg/util/feature" | ||||||
|  | 	"k8s.io/kubernetes/pkg/features" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| // matchPodFailurePolicy returns information about matching a given failed pod | // matchPodFailurePolicy returns information about matching a given failed pod | ||||||
| // against the pod failure policy rules. The information is represented as an | // against the pod failure policy rules. The information is represented as an | ||||||
| // optional job failure message (present in case the pod matched a 'FailJob' | //   - optional job failure message (present in case the pod matched a 'FailJob' rule), | ||||||
| // rule), a boolean indicating if the failure should be counted towards | //   - a boolean indicating if the failure should be counted towards backoffLimit | ||||||
| // backoffLimit (it should not be counted if the pod matched an 'Ignore' rule), | //     (and backoffLimitPerIndex if specified). It should not be counted | ||||||
| // and a pointer to the matched pod failure policy action. | //     if the pod matched an 'Ignore' rule, | ||||||
|  | //   - a pointer to the matched pod failure policy action. | ||||||
| func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool, *batch.PodFailurePolicyAction) { | func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool, *batch.PodFailurePolicyAction) { | ||||||
| 	if podFailurePolicy == nil { | 	if podFailurePolicy == nil { | ||||||
| 		return nil, true, nil | 		return nil, true, nil | ||||||
| 	} | 	} | ||||||
| 	ignore := batch.PodFailurePolicyActionIgnore | 	ignore := batch.PodFailurePolicyActionIgnore | ||||||
| 	failJob := batch.PodFailurePolicyActionFailJob | 	failJob := batch.PodFailurePolicyActionFailJob | ||||||
|  | 	failIndex := batch.PodFailurePolicyActionFailIndex | ||||||
| 	count := batch.PodFailurePolicyActionCount | 	count := batch.PodFailurePolicyActionCount | ||||||
| 	for index, podFailurePolicyRule := range podFailurePolicy.Rules { | 	for index, podFailurePolicyRule := range podFailurePolicy.Rules { | ||||||
| 		if podFailurePolicyRule.OnExitCodes != nil { | 		if podFailurePolicyRule.OnExitCodes != nil { | ||||||
| @@ -42,6 +46,10 @@ func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod * | |||||||
| 				switch podFailurePolicyRule.Action { | 				switch podFailurePolicyRule.Action { | ||||||
| 				case batch.PodFailurePolicyActionIgnore: | 				case batch.PodFailurePolicyActionIgnore: | ||||||
| 					return nil, false, &ignore | 					return nil, false, &ignore | ||||||
|  | 				case batch.PodFailurePolicyActionFailIndex: | ||||||
|  | 					if feature.DefaultFeatureGate.Enabled(features.JobBackoffLimitPerIndex) { | ||||||
|  | 						return nil, true, &failIndex | ||||||
|  | 					} | ||||||
| 				case batch.PodFailurePolicyActionCount: | 				case batch.PodFailurePolicyActionCount: | ||||||
| 					return nil, true, &count | 					return nil, true, &count | ||||||
| 				case batch.PodFailurePolicyActionFailJob: | 				case batch.PodFailurePolicyActionFailJob: | ||||||
| @@ -55,6 +63,10 @@ func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod * | |||||||
| 				switch podFailurePolicyRule.Action { | 				switch podFailurePolicyRule.Action { | ||||||
| 				case batch.PodFailurePolicyActionIgnore: | 				case batch.PodFailurePolicyActionIgnore: | ||||||
| 					return nil, false, &ignore | 					return nil, false, &ignore | ||||||
|  | 				case batch.PodFailurePolicyActionFailIndex: | ||||||
|  | 					if feature.DefaultFeatureGate.Enabled(features.JobBackoffLimitPerIndex) { | ||||||
|  | 						return nil, true, &failIndex | ||||||
|  | 					} | ||||||
| 				case batch.PodFailurePolicyActionCount: | 				case batch.PodFailurePolicyActionCount: | ||||||
| 					return nil, true, &count | 					return nil, true, &count | ||||||
| 				case batch.PodFailurePolicyActionFailJob: | 				case batch.PodFailurePolicyActionFailJob: | ||||||
|   | |||||||
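
For context on the new FailIndex action handled above, here is a hedged sketch of how such a rule is declared and what the documented return contract implies. The exit code 42 and the snippet are illustrative only; the authoritative coverage is in the test cases below:

package main

import (
	"fmt"

	batch "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
)

func main() {
	// A FailIndex rule: a matching exit code marks the pod's index as failed
	// instead of failing the whole Job or merely retrying.
	policy := &batch.PodFailurePolicy{
		Rules: []batch.PodFailurePolicyRule{{
			Action: batch.PodFailurePolicyActionFailIndex,
			OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
				Operator: batch.PodFailurePolicyOnExitCodesOpIn,
				Values:   []int32{42},
			},
		}},
	}
	failedPod := &v1.Pod{Status: v1.PodStatus{
		Phase: v1.PodFailed,
		ContainerStatuses: []v1.ContainerStatus{{
			State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ExitCode: 42}},
		}},
	}}
	// Per the doc comment and the tests: matchPodFailurePolicy(policy, failedPod)
	// yields no job failure message, countFailed == true, and the FailIndex action
	// when the JobBackoffLimitPerIndex gate is enabled; with the gate disabled the
	// rule is skipped and the returned action is nil.
	fmt.Println(policy.Rules[0].Action, failedPod.Status.Phase)
}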
| @@ -23,7 +23,10 @@ import ( | |||||||
| 	batch "k8s.io/api/batch/v1" | 	batch "k8s.io/api/batch/v1" | ||||||
| 	v1 "k8s.io/api/core/v1" | 	v1 "k8s.io/api/core/v1" | ||||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||
|  | 	utilfeature "k8s.io/apiserver/pkg/util/feature" | ||||||
|  | 	featuregatetesting "k8s.io/component-base/featuregate/testing" | ||||||
| 	_ "k8s.io/kubernetes/pkg/apis/core/install" | 	_ "k8s.io/kubernetes/pkg/apis/core/install" | ||||||
|  | 	"k8s.io/kubernetes/pkg/features" | ||||||
| 	"k8s.io/utils/pointer" | 	"k8s.io/utils/pointer" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| @@ -34,14 +37,16 @@ func TestMatchPodFailurePolicy(t *testing.T) { | |||||||
| 	} | 	} | ||||||
| 	ignore := batch.PodFailurePolicyActionIgnore | 	ignore := batch.PodFailurePolicyActionIgnore | ||||||
| 	failJob := batch.PodFailurePolicyActionFailJob | 	failJob := batch.PodFailurePolicyActionFailJob | ||||||
|  | 	failIndex := batch.PodFailurePolicyActionFailIndex | ||||||
| 	count := batch.PodFailurePolicyActionCount | 	count := batch.PodFailurePolicyActionCount | ||||||
|  |  | ||||||
| 	testCases := map[string]struct { | 	testCases := map[string]struct { | ||||||
| 		podFailurePolicy      *batch.PodFailurePolicy | 		enableJobBackoffLimitPerIndex bool | ||||||
| 		failedPod             *v1.Pod | 		podFailurePolicy              *batch.PodFailurePolicy | ||||||
| 		wantJobFailureMessage *string | 		failedPod                     *v1.Pod | ||||||
| 		wantCountFailed       bool | 		wantJobFailureMessage         *string | ||||||
| 		wantAction            *batch.PodFailurePolicyAction | 		wantCountFailed               bool | ||||||
|  | 		wantAction                    *batch.PodFailurePolicyAction | ||||||
| 	}{ | 	}{ | ||||||
| 		"unknown action for rule matching by exit codes - skip rule with unknown action": { | 		"unknown action for rule matching by exit codes - skip rule with unknown action": { | ||||||
| 			podFailurePolicy: &batch.PodFailurePolicy{ | 			podFailurePolicy: &batch.PodFailurePolicy{ | ||||||
| @@ -292,6 +297,68 @@ func TestMatchPodFailurePolicy(t *testing.T) { | |||||||
| 			wantJobFailureMessage: nil, | 			wantJobFailureMessage: nil, | ||||||
| 			wantCountFailed:       true, | 			wantCountFailed:       true, | ||||||
| 		}, | 		}, | ||||||
|  | 		"FailIndex rule matched for exit codes; JobBackoffLimitPerIndex enabled": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			podFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 				Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 					{ | ||||||
|  | 						Action: batch.PodFailurePolicyActionFailIndex, | ||||||
|  | 						OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ | ||||||
|  | 							Operator: batch.PodFailurePolicyOnExitCodesOpIn, | ||||||
|  | 							Values:   []int32{1, 2, 3}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			failedPod: &v1.Pod{ | ||||||
|  | 				ObjectMeta: validPodObjectMeta, | ||||||
|  | 				Status: v1.PodStatus{ | ||||||
|  | 					Phase: v1.PodFailed, | ||||||
|  | 					ContainerStatuses: []v1.ContainerStatus{ | ||||||
|  | 						{ | ||||||
|  | 							State: v1.ContainerState{ | ||||||
|  | 								Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 									ExitCode: 2, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantCountFailed: true, | ||||||
|  | 			wantAction:      &failIndex, | ||||||
|  | 		}, | ||||||
|  | 		"FailIndex rule matched for exit codes; JobBackoffLimitPerIndex disabled": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: false, | ||||||
|  | 			podFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 				Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 					{ | ||||||
|  | 						Action: batch.PodFailurePolicyActionFailIndex, | ||||||
|  | 						OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ | ||||||
|  | 							Operator: batch.PodFailurePolicyOnExitCodesOpIn, | ||||||
|  | 							Values:   []int32{1, 2, 3}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			failedPod: &v1.Pod{ | ||||||
|  | 				ObjectMeta: validPodObjectMeta, | ||||||
|  | 				Status: v1.PodStatus{ | ||||||
|  | 					Phase: v1.PodFailed, | ||||||
|  | 					ContainerStatuses: []v1.ContainerStatus{ | ||||||
|  | 						{ | ||||||
|  | 							State: v1.ContainerState{ | ||||||
|  | 								Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 									ExitCode: 2, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantCountFailed: true, | ||||||
|  | 			wantAction:      nil, | ||||||
|  | 		}, | ||||||
| 		"pod failure policy with NotIn operator and value 0": { | 		"pod failure policy with NotIn operator and value 0": { | ||||||
| 			podFailurePolicy: &batch.PodFailurePolicy{ | 			podFailurePolicy: &batch.PodFailurePolicy{ | ||||||
| 				Rules: []batch.PodFailurePolicyRule{ | 				Rules: []batch.PodFailurePolicyRule{ | ||||||
| @@ -406,6 +473,66 @@ func TestMatchPodFailurePolicy(t *testing.T) { | |||||||
| 			wantCountFailed:       true, | 			wantCountFailed:       true, | ||||||
| 			wantAction:            &count, | 			wantAction:            &count, | ||||||
| 		}, | 		}, | ||||||
|  | 		"FailIndex rule matched for pod conditions; JobBackoffLimitPerIndex enabled": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: true, | ||||||
|  | 			podFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 				Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 					{ | ||||||
|  | 						Action: batch.PodFailurePolicyActionFailIndex, | ||||||
|  | 						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ | ||||||
|  | 							{ | ||||||
|  | 								Type:   v1.DisruptionTarget, | ||||||
|  | 								Status: v1.ConditionTrue, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			failedPod: &v1.Pod{ | ||||||
|  | 				ObjectMeta: validPodObjectMeta, | ||||||
|  | 				Status: v1.PodStatus{ | ||||||
|  | 					Phase: v1.PodFailed, | ||||||
|  | 					Conditions: []v1.PodCondition{ | ||||||
|  | 						{ | ||||||
|  | 							Type:   v1.DisruptionTarget, | ||||||
|  | 							Status: v1.ConditionTrue, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantCountFailed: true, | ||||||
|  | 			wantAction:      &failIndex, | ||||||
|  | 		}, | ||||||
|  | 		"FailIndex rule matched for pod conditions; JobBackoffLimitPerIndex disabled": { | ||||||
|  | 			enableJobBackoffLimitPerIndex: false, | ||||||
|  | 			podFailurePolicy: &batch.PodFailurePolicy{ | ||||||
|  | 				Rules: []batch.PodFailurePolicyRule{ | ||||||
|  | 					{ | ||||||
|  | 						Action: batch.PodFailurePolicyActionFailIndex, | ||||||
|  | 						OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ | ||||||
|  | 							{ | ||||||
|  | 								Type:   v1.DisruptionTarget, | ||||||
|  | 								Status: v1.ConditionTrue, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			failedPod: &v1.Pod{ | ||||||
|  | 				ObjectMeta: validPodObjectMeta, | ||||||
|  | 				Status: v1.PodStatus{ | ||||||
|  | 					Phase: v1.PodFailed, | ||||||
|  | 					Conditions: []v1.PodCondition{ | ||||||
|  | 						{ | ||||||
|  | 							Type:   v1.DisruptionTarget, | ||||||
|  | 							Status: v1.ConditionTrue, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantCountFailed: true, | ||||||
|  | 			wantAction:      nil, | ||||||
|  | 		}, | ||||||
| 		"ignore rule matched for pod conditions": { | 		"ignore rule matched for pod conditions": { | ||||||
| 			podFailurePolicy: &batch.PodFailurePolicy{ | 			podFailurePolicy: &batch.PodFailurePolicy{ | ||||||
| 				Rules: []batch.PodFailurePolicyRule{ | 				Rules: []batch.PodFailurePolicyRule{ | ||||||
| @@ -709,6 +836,7 @@ func TestMatchPodFailurePolicy(t *testing.T) { | |||||||
| 	} | 	} | ||||||
| 	for name, tc := range testCases { | 	for name, tc := range testCases { | ||||||
| 		t.Run(name, func(t *testing.T) { | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)() | ||||||
| 			jobFailMessage, countFailed, action := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod) | 			jobFailMessage, countFailed, action := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod) | ||||||
| 			if diff := cmp.Diff(tc.wantJobFailureMessage, jobFailMessage); diff != "" { | 			if diff := cmp.Diff(tc.wantJobFailureMessage, jobFailMessage); diff != "" { | ||||||
| 				t.Errorf("Unexpected job failure message: %s", diff) | 				t.Errorf("Unexpected job failure message: %s", diff) | ||||||
|   | |||||||
| @@ -682,6 +682,633 @@ func TestJobPodFailurePolicy(t *testing.T) { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // TestBackoffLimitPerIndex_DelayedPodDeletion tests that the pod deletion is delayed | ||||||
|  | // until the replacement pod is created, so that the replacement pod has the | ||||||
|  | // index-failure-count annotation bumped, when BackoffLimitPerIndex is used. | ||||||
|  | func TestBackoffLimitPerIndex_DelayedPodDeletion(t *testing.T) { | ||||||
|  | 	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) | ||||||
|  |  | ||||||
|  | 	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() | ||||||
|  | 	closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-failed") | ||||||
|  | 	defer closeFn() | ||||||
|  | 	ctx, cancel := startJobControllerAndWaitForCaches(restConfig) | ||||||
|  | 	defer cancel() | ||||||
|  |  | ||||||
|  | 	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ | ||||||
|  | 		Spec: batchv1.JobSpec{ | ||||||
|  | 			Parallelism:          pointer.Int32(1), | ||||||
|  | 			Completions:          pointer.Int32(1), | ||||||
|  | 			BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 			CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 		}, | ||||||
|  | 	}) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("Failed to create Job: %v", err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 1, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// First pod from index 0 failed. | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil { | ||||||
|  | 		t.Fatal("Failed trying to fail pod with index 0") | ||||||
|  | 	} | ||||||
|  | 	// Delete the failed pod | ||||||
|  | 	pod, err := getJobPodForIndex(ctx, clientSet, jobObj, 0, func(_ *v1.Pod) bool { return true }) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("failed to get terminal pod for index %v: %v", 0, err) | ||||||
|  | 	} | ||||||
|  | 	if err := clientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil { | ||||||
|  | 		t.Fatalf("failed to delete pod: %v, error: %v", klog.KObj(pod), err) | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 1, | ||||||
|  | 		Failed: 1, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// Verify the replacement pod is created and has the index-failure-count | ||||||
|  | 	// annotation bumped. | ||||||
|  | 	replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, 0) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", 0, err) | ||||||
|  | 	} | ||||||
|  | 	gotIndexFailureCount, err := getIndexFailureCount(replacement) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("Failed to read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err) | ||||||
|  | 	} | ||||||
|  | 	if diff := cmp.Diff(1, gotIndexFailureCount); diff != "" { | ||||||
|  | 		t.Errorf("Unexpected index failure count for the replacement pod: %s", diff) | ||||||
|  | 	} | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil { | ||||||
|  | 		t.Fatal("Failed trying to succeed pod with index 0") | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active:    0, | ||||||
|  | 		Succeeded: 1, | ||||||
|  | 		Failed:    1, | ||||||
|  | 		Ready:     pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateJobSucceeded(ctx, t, clientSet, jobObj) | ||||||
|  | } | ||||||
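
A note on the assertion above: the replacement pod is expected to carry the failed pod's index-failure-count annotation value plus one (the first pod for an index starts with the annotation at "0", so the replacement is checked for 1). A minimal sketch of that expectation, reusing the test file's imports; the helper name is illustrative and not part of the change:

// expectedReplacementFailureCount mirrors the check above: the replacement pod
// should carry the failed pod's JobIndexFailureCountAnnotation value plus one.
func expectedReplacementFailureCount(failedPod *v1.Pod) (int, error) {
	cur, err := strconv.Atoi(failedPod.Annotations[batchv1.JobIndexFailureCountAnnotation])
	if err != nil {
		return 0, err
	}
	return cur + 1, nil
}
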
|  |  | ||||||
|  | // TestBackoffLimitPerIndex_Reenabling tests handling of pod failures when | ||||||
|  | // re-enabling the JobBackoffLimitPerIndex feature gate. | ||||||
|  | func TestBackoffLimitPerIndex_Reenabling(t *testing.T) { | ||||||
|  | 	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) | ||||||
|  |  | ||||||
|  | 	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() | ||||||
|  | 	closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-reenabled") | ||||||
|  | 	defer closeFn() | ||||||
|  | 	ctx, cancel := startJobControllerAndWaitForCaches(restConfig) | ||||||
|  | 	defer cancel() | ||||||
|  | 	resetMetrics() | ||||||
|  |  | ||||||
|  | 	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ | ||||||
|  | 		Spec: batchv1.JobSpec{ | ||||||
|  | 			Parallelism:          pointer.Int32(3), | ||||||
|  | 			Completions:          pointer.Int32(3), | ||||||
|  | 			BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 			CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 		}, | ||||||
|  | 	}) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("Failed to create Job: %v", err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 3, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// First pod from index 0 failed | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil { | ||||||
|  | 		t.Fatal("Failed trying to fail pod with index 0") | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 2, | ||||||
|  | 		Failed: 1, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1, 2), "", pointer.String("0")) | ||||||
|  |  | ||||||
|  | 	// Disable the feature | ||||||
|  | 	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, false)() | ||||||
|  |  | ||||||
|  | 	// First pod from index 1 failed | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { | ||||||
|  | 		t.Fatal("Failed trying to fail pod with index 1") | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 3, | ||||||
|  | 		Failed: 2, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil) | ||||||
|  |  | ||||||
|  | 	// Reenable the feature | ||||||
|  | 	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() | ||||||
|  |  | ||||||
|  | 	// First pod from index 2 failed | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil { | ||||||
|  | 		t.Fatal("Failed trying to fail pod with index 2") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	// Verify that indexes 0 and 1 are active, as the failed pods don't have | ||||||
|  | 	// finalizers at this point, so they are ignored. | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 2, | ||||||
|  | 		Failed: 3, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String("2")) | ||||||
|  |  | ||||||
|  | 	// Mark the remaining pods as Succeeded and verify the Job status. | ||||||
|  | 	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil { | ||||||
|  | 		t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err) | ||||||
|  | 	} | ||||||
|  | 	validateJobFailed(ctx, t, clientSet, jobObj) | ||||||
|  | 	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff tests that the | ||||||
|  | // pods are recreated with an exponential backoff delay computed independently | ||||||
|  | // per index. Scenario: | ||||||
|  | // - fail index 0 | ||||||
|  | // - fail index 0 | ||||||
|  | // - fail index 1 | ||||||
|  | // - succeed index 0 | ||||||
|  | // - fail index 1 | ||||||
|  | // - succeed index 1 | ||||||
|  | func TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff(t *testing.T) { | ||||||
|  | 	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() | ||||||
|  | 	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, 2*time.Second)) | ||||||
|  |  | ||||||
|  | 	closeFn, restConfig, clientSet, ns := setup(t, "simple") | ||||||
|  | 	defer closeFn() | ||||||
|  | 	ctx, cancel := startJobControllerAndWaitForCaches(restConfig) | ||||||
|  | 	defer cancel() | ||||||
|  |  | ||||||
|  | 	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ | ||||||
|  | 		Spec: batchv1.JobSpec{ | ||||||
|  | 			Completions:          pointer.Int32(2), | ||||||
|  | 			Parallelism:          pointer.Int32(2), | ||||||
|  | 			BackoffLimitPerIndex: pointer.Int32(2), | ||||||
|  | 			CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 		}, | ||||||
|  | 	}) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("Could not create job: %v", err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 2, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// Fail the first pod for index 0 | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil { | ||||||
|  | 		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 2, | ||||||
|  | 		Failed: 1, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// Fail the second pod for index 0 | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil { | ||||||
|  | 		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 2, | ||||||
|  | 		Failed: 2, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// Fail the first pod for index 1 | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { | ||||||
|  | 		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active: 2, | ||||||
|  | 		Failed: 3, | ||||||
|  | 		Ready:  pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// Succeed the third pod for index 0 | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil { | ||||||
|  | 		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active:    1, | ||||||
|  | 		Failed:    3, | ||||||
|  | 		Succeeded: 1, | ||||||
|  | 		Ready:     pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// Fail the second pod for index 1 | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { | ||||||
|  | 		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active:    1, | ||||||
|  | 		Failed:    4, | ||||||
|  | 		Succeeded: 1, | ||||||
|  | 		Ready:     pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", pointer.String("")) | ||||||
|  |  | ||||||
|  | 	// Succeed the third pod for index 1 | ||||||
|  | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil { | ||||||
|  | 		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err) | ||||||
|  | 	} | ||||||
|  | 	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 		Active:    0, | ||||||
|  | 		Failed:    4, | ||||||
|  | 		Succeeded: 2, | ||||||
|  | 		Ready:     pointer.Int32(0), | ||||||
|  | 	}) | ||||||
|  | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New[int](), "0,1", pointer.String("")) | ||||||
|  | 	validateJobSucceeded(ctx, t, clientSet, jobObj) | ||||||
|  |  | ||||||
|  | 	for index := 0; index < int(*jobObj.Spec.Completions); index++ { | ||||||
|  | 		podsForIndex, err := getJobPodsForIndex(ctx, clientSet, jobObj, index, func(_ *v1.Pod) bool { return true }) | ||||||
|  | 		if err != nil { | ||||||
|  | 			t.Fatalf("Failed to list job %q pods for index %v, error: %v", klog.KObj(jobObj), index, err) | ||||||
|  | 		} | ||||||
|  | 		validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, podsForIndex) | ||||||
|  | 	} | ||||||
|  | } | ||||||
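
The scenario above relies on the recreation delay for an index doubling with each consecutive failure of that index, up to a cap. A minimal sketch of such a capped exponential backoff (the function and its parameters are illustrative, not the controller's actual code), reusing the test file's time import:

// backoffForIndex sketches the delay the test expects before recreating a pod
// for an index: defaultBackoff * 2^(consecutiveFailures-1), capped at maxBackoff.
func backoffForIndex(consecutiveFailures int, defaultBackoff, maxBackoff time.Duration) time.Duration {
	if consecutiveFailures <= 0 {
		return 0
	}
	d := defaultBackoff << (consecutiveFailures - 1) // defaultBackoff * 2^(n-1)
	if d > maxBackoff || d < defaultBackoff {        // cap, and guard against shift overflow
		return maxBackoff
	}
	return d
}

With DefaultJobPodFailureBackOff set to 2s in this test, that means roughly a 2s wait after the first failure of an index and a 4s wait after the second, which is what validateExpotentialBackoffDelay verifies per index.
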
|  |  | ||||||
|  | // TestBackoffLimitPerIndex tests handling of a job and its pods when | ||||||
|  | // backoff limit per index is used. | ||||||
|  | func TestBackoffLimitPerIndex(t *testing.T) { | ||||||
|  | 	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) | ||||||
|  |  | ||||||
|  | 	type podTerminationWithExpectations struct { | ||||||
|  | 		index                          int | ||||||
|  | 		status                         v1.PodStatus | ||||||
|  | 		wantActive                     int | ||||||
|  | 		wantFailed                     int | ||||||
|  | 		wantSucceeded                  int | ||||||
|  | 		wantActiveIndexes              sets.Set[int] | ||||||
|  | 		wantCompletedIndexes           string | ||||||
|  | 		wantFailedIndexes              *string | ||||||
|  | 		wantReplacementPodFailureCount *int | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	podTemplateSpec := v1.PodTemplateSpec{ | ||||||
|  | 		Spec: v1.PodSpec{ | ||||||
|  | 			Containers: []v1.Container{ | ||||||
|  | 				{ | ||||||
|  | 					Name:                     "main-container", | ||||||
|  | 					Image:                    "foo", | ||||||
|  | 					ImagePullPolicy:          v1.PullIfNotPresent, | ||||||
|  | 					TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	testCases := map[string]struct { | ||||||
|  | 		job                  batchv1.Job | ||||||
|  | 		podTerminations      []podTerminationWithExpectations | ||||||
|  | 		wantJobConditionType batchv1.JobConditionType | ||||||
|  | 	}{ | ||||||
|  | 		"job succeeded": { | ||||||
|  | 			job: batchv1.Job{ | ||||||
|  | 				Spec: batchv1.JobSpec{ | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					Template:             podTemplateSpec, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			podTerminations: []podTerminationWithExpectations{ | ||||||
|  | 				{ | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:                     2, | ||||||
|  | 					wantFailed:                     1, | ||||||
|  | 					wantActiveIndexes:              sets.New(0, 1), | ||||||
|  | 					wantFailedIndexes:              pointer.String(""), | ||||||
|  | 					wantReplacementPodFailureCount: pointer.Int(1), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantJobConditionType: batchv1.JobComplete, | ||||||
|  | 		}, | ||||||
|  | 		"job index fails due to exceeding backoff limit per index": { | ||||||
|  | 			job: batchv1.Job{ | ||||||
|  | 				Spec: batchv1.JobSpec{ | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(2), | ||||||
|  | 					Template:             podTemplateSpec, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			podTerminations: []podTerminationWithExpectations{ | ||||||
|  | 				{ | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:                     2, | ||||||
|  | 					wantFailed:                     1, | ||||||
|  | 					wantActiveIndexes:              sets.New(0, 1), | ||||||
|  | 					wantFailedIndexes:              pointer.String(""), | ||||||
|  | 					wantReplacementPodFailureCount: pointer.Int(1), | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:                     2, | ||||||
|  | 					wantFailed:                     2, | ||||||
|  | 					wantActiveIndexes:              sets.New(0, 1), | ||||||
|  | 					wantFailedIndexes:              pointer.String(""), | ||||||
|  | 					wantReplacementPodFailureCount: pointer.Int(2), | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:        1, | ||||||
|  | 					wantFailed:        3, | ||||||
|  | 					wantActiveIndexes: sets.New(1), | ||||||
|  | 					wantFailedIndexes: pointer.String("0"), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantJobConditionType: batchv1.JobFailed, | ||||||
|  | 		}, | ||||||
|  | 		"job index fails due to exceeding the global backoff limit first": { | ||||||
|  | 			job: batchv1.Job{ | ||||||
|  | 				Spec: batchv1.JobSpec{ | ||||||
|  | 					Parallelism:          pointer.Int32(3), | ||||||
|  | 					Completions:          pointer.Int32(3), | ||||||
|  | 					CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					BackoffLimit:         pointer.Int32(2), | ||||||
|  | 					Template:             podTemplateSpec, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			podTerminations: []podTerminationWithExpectations{ | ||||||
|  | 				{ | ||||||
|  | 					index: 0, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:        3, | ||||||
|  | 					wantFailed:        1, | ||||||
|  | 					wantActiveIndexes: sets.New(0, 1, 2), | ||||||
|  | 					wantFailedIndexes: pointer.String(""), | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					index: 1, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:        3, | ||||||
|  | 					wantFailed:        2, | ||||||
|  | 					wantActiveIndexes: sets.New(0, 1, 2), | ||||||
|  | 					wantFailedIndexes: pointer.String(""), | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					index: 2, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantFailed:        5, | ||||||
|  | 					wantFailedIndexes: pointer.String(""), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantJobConditionType: batchv1.JobFailed, | ||||||
|  | 		}, | ||||||
|  | 		"job continues execution after a failed index, the job is marked Failed due to the failed index": { | ||||||
|  | 			job: batchv1.Job{ | ||||||
|  | 				Spec: batchv1.JobSpec{ | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 					Template:             podTemplateSpec, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			podTerminations: []podTerminationWithExpectations{ | ||||||
|  | 				{ | ||||||
|  | 					index: 0, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:        1, | ||||||
|  | 					wantFailed:        1, | ||||||
|  | 					wantActiveIndexes: sets.New(1), | ||||||
|  | 					wantFailedIndexes: pointer.String("0"), | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					index: 1, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodSucceeded, | ||||||
|  | 					}, | ||||||
|  | 					wantFailed:           1, | ||||||
|  | 					wantSucceeded:        1, | ||||||
|  | 					wantFailedIndexes:    pointer.String("0"), | ||||||
|  | 					wantCompletedIndexes: "1", | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantJobConditionType: batchv1.JobFailed, | ||||||
|  | 		}, | ||||||
|  | 		"job execution terminated early due to exceeding max failed indexes": { | ||||||
|  | 			job: batchv1.Job{ | ||||||
|  | 				Spec: batchv1.JobSpec{ | ||||||
|  | 					Parallelism:          pointer.Int32(3), | ||||||
|  | 					Completions:          pointer.Int32(3), | ||||||
|  | 					CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(0), | ||||||
|  | 					MaxFailedIndexes:     pointer.Int32(1), | ||||||
|  | 					Template:             podTemplateSpec, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			podTerminations: []podTerminationWithExpectations{ | ||||||
|  | 				{ | ||||||
|  | 					index: 0, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:        2, | ||||||
|  | 					wantFailed:        1, | ||||||
|  | 					wantActiveIndexes: sets.New(1, 2), | ||||||
|  | 					wantFailedIndexes: pointer.String("0"), | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					index: 1, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:        0, | ||||||
|  | 					wantFailed:        3, | ||||||
|  | 					wantFailedIndexes: pointer.String("0,1"), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantJobConditionType: batchv1.JobFailed, | ||||||
|  | 		}, | ||||||
|  | 		"pod failure matching pod failure policy rule with FailIndex action": { | ||||||
|  | 			job: batchv1.Job{ | ||||||
|  | 				Spec: batchv1.JobSpec{ | ||||||
|  | 					Parallelism:          pointer.Int32(2), | ||||||
|  | 					Completions:          pointer.Int32(2), | ||||||
|  | 					CompletionMode:       completionModePtr(batchv1.IndexedCompletion), | ||||||
|  | 					BackoffLimitPerIndex: pointer.Int32(1), | ||||||
|  | 					Template:             podTemplateSpec, | ||||||
|  | 					PodFailurePolicy: &batchv1.PodFailurePolicy{ | ||||||
|  | 						Rules: []batchv1.PodFailurePolicyRule{ | ||||||
|  | 							{ | ||||||
|  | 								Action: batchv1.PodFailurePolicyActionFailIndex, | ||||||
|  | 								OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ | ||||||
|  | 									Operator: batchv1.PodFailurePolicyOnExitCodesOpIn, | ||||||
|  | 									Values:   []int32{13}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 							{ | ||||||
|  | 								Action: batchv1.PodFailurePolicyActionFailIndex, | ||||||
|  | 								OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{ | ||||||
|  | 									{ | ||||||
|  | 										Type:   v1.DisruptionTarget, | ||||||
|  | 										Status: v1.ConditionTrue, | ||||||
|  | 									}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			podTerminations: []podTerminationWithExpectations{ | ||||||
|  | 				{ | ||||||
|  | 					index: 0, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 						ContainerStatuses: []v1.ContainerStatus{ | ||||||
|  | 							{ | ||||||
|  | 								State: v1.ContainerState{ | ||||||
|  | 									Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 										ExitCode: 13, | ||||||
|  | 									}, | ||||||
|  | 								}, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 					wantActive:        1, | ||||||
|  | 					wantFailed:        1, | ||||||
|  | 					wantActiveIndexes: sets.New(1), | ||||||
|  | 					wantFailedIndexes: pointer.String("0"), | ||||||
|  | 				}, | ||||||
|  | 				{ | ||||||
|  | 					index: 1, | ||||||
|  | 					status: v1.PodStatus{ | ||||||
|  | 						Phase: v1.PodFailed, | ||||||
|  | 						Conditions: []v1.PodCondition{ | ||||||
|  | 							{ | ||||||
|  | 								Type:   v1.DisruptionTarget, | ||||||
|  | 								Status: v1.ConditionTrue, | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 					wantFailed:        2, | ||||||
|  | 					wantFailedIndexes: pointer.String("0,1"), | ||||||
|  | 				}, | ||||||
|  | 			}, | ||||||
|  | 			wantJobConditionType: batchv1.JobFailed, | ||||||
|  | 		}, | ||||||
|  | 	} | ||||||
|  | 	for name, test := range testCases { | ||||||
|  | 		t.Run(name, func(t *testing.T) { | ||||||
|  | 			resetMetrics() | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, true)() | ||||||
|  | 			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() | ||||||
|  |  | ||||||
|  | 			closeFn, restConfig, clientSet, ns := setup(t, "simple") | ||||||
|  | 			defer closeFn() | ||||||
|  | 			ctx, cancel := startJobControllerAndWaitForCaches(restConfig) | ||||||
|  | 			defer cancel() | ||||||
|  | 			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job) | ||||||
|  | 			if err != nil { | ||||||
|  | 				t.Fatalf("Error %q while creating the job", err) | ||||||
|  | 			} | ||||||
|  | 			validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 				Active: int(*test.job.Spec.Parallelism), | ||||||
|  | 				Ready:  pointer.Int32(0), | ||||||
|  | 			}) | ||||||
|  | 			for _, podTermination := range test.podTerminations { | ||||||
|  | 				pod, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index) | ||||||
|  | 				if err != nil { | ||||||
|  | 					t.Fatalf("listing Job Pods: %q", err) | ||||||
|  | 				} | ||||||
|  | 				pod.Status = podTermination.status | ||||||
|  | 				if _, err = clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, pod, metav1.UpdateOptions{}); err != nil { | ||||||
|  | 					t.Fatalf("Error updating the pod %q: %q", klog.KObj(pod), err) | ||||||
|  | 				} | ||||||
|  | 				validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ | ||||||
|  | 					Active:    podTermination.wantActive, | ||||||
|  | 					Succeeded: podTermination.wantSucceeded, | ||||||
|  | 					Failed:    podTermination.wantFailed, | ||||||
|  | 					Ready:     pointer.Int32(0), | ||||||
|  | 				}) | ||||||
|  | 				validateIndexedJobPods(ctx, t, clientSet, jobObj, podTermination.wantActiveIndexes, podTermination.wantCompletedIndexes, podTermination.wantFailedIndexes) | ||||||
|  | 				if podTermination.wantReplacementPodFailureCount != nil { | ||||||
|  | 					replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index) | ||||||
|  | 					if err != nil { | ||||||
|  | 						t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", podTermination.index, err) | ||||||
|  | 					} | ||||||
|  | 					gotReplacementPodFailureCount, err := getIndexFailureCount(replacement) | ||||||
|  | 					if err != nil { | ||||||
|  | 						t.Fatalf("Failed to read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err) | ||||||
|  | 					} | ||||||
|  | 					if *podTermination.wantReplacementPodFailureCount != gotReplacementPodFailureCount { | ||||||
|  | 						t.Fatalf("Unexpected value of the index failure count annotation. Want: %v, got: %v", *podTermination.wantReplacementPodFailureCount, gotReplacementPodFailureCount) | ||||||
|  | 					} | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  |  | ||||||
|  | 			remainingActive := test.podTerminations[len(test.podTerminations)-1].wantActive | ||||||
|  | 			if remainingActive > 0 { | ||||||
|  | 				if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remainingActive); err != nil { | ||||||
|  | 					t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err) | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType) | ||||||
|  | 			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) | ||||||
|  | 		}) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func getIndexFailureCount(p *v1.Pod) (int, error) { | ||||||
|  | 	if p.Annotations == nil { | ||||||
|  | 		return 0, errors.New("no annotations found") | ||||||
|  | 	} | ||||||
|  | 	v, ok := p.Annotations[batchv1.JobIndexFailureCountAnnotation] | ||||||
|  | 	if !ok { | ||||||
|  | 		return 0, fmt.Errorf("annotation %s not found", batchv1.JobIndexFailureCountAnnotation) | ||||||
|  | 	} | ||||||
|  | 	return strconv.Atoi(v) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func completionModePtr(cm batchv1.CompletionMode) *batchv1.CompletionMode { | ||||||
|  | 	return &cm | ||||||
|  | } | ||||||
|  |  | ||||||
| // TestNonParallelJob tests a Job that only executes one Pod. The test | // TestNonParallelJob tests a Job that only executes one Pod. The test | ||||||
| // recreates the Job controller at some points to make sure a new controller | // recreates the Job controller at some points to make sure a new controller | ||||||
| // is able to pick up. | // is able to pick up. | ||||||
| @@ -999,7 +1626,7 @@ func TestIndexedJob(t *testing.T) { | |||||||
| 		Active: 3, | 		Active: 3, | ||||||
| 		Ready:  pointer.Int32(0), | 		Ready:  pointer.Int32(0), | ||||||
| 	}) | 	}) | ||||||
| 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "") | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil) | ||||||
|  |  | ||||||
| 	// One Pod succeeds. | 	// One Pod succeeds. | ||||||
| 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil { | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil { | ||||||
| @@ -1010,7 +1637,7 @@ func TestIndexedJob(t *testing.T) { | |||||||
| 		Succeeded: 1, | 		Succeeded: 1, | ||||||
| 		Ready:     pointer.Int32(0), | 		Ready:     pointer.Int32(0), | ||||||
| 	}) | 	}) | ||||||
| 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1") | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil) | ||||||
|  |  | ||||||
| 	// One Pod fails, which should be recreated. | 	// One Pod fails, which should be recreated. | ||||||
| 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil { | 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil { | ||||||
| @@ -1022,7 +1649,7 @@ func TestIndexedJob(t *testing.T) { | |||||||
| 		Succeeded: 1, | 		Succeeded: 1, | ||||||
| 		Ready:     pointer.Int32(0), | 		Ready:     pointer.Int32(0), | ||||||
| 	}) | 	}) | ||||||
| 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1") | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil) | ||||||
|  |  | ||||||
| 	// Remaining Pods succeed. | 	// Remaining Pods succeed. | ||||||
| 	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil { | 	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil { | ||||||
| @@ -1034,7 +1661,7 @@ func TestIndexedJob(t *testing.T) { | |||||||
| 		Succeeded: 4, | 		Succeeded: 4, | ||||||
| 		Ready:     pointer.Int32(0), | 		Ready:     pointer.Int32(0), | ||||||
| 	}) | 	}) | ||||||
| 	validateIndexedJobPods(ctx, t, clientSet, jobObj, nil, "0-3") | 	validateIndexedJobPods(ctx, t, clientSet, jobObj, nil, "0-3", nil) | ||||||
| 	validateJobSucceeded(ctx, t, clientSet, jobObj) | 	validateJobSucceeded(ctx, t, clientSet, jobObj) | ||||||
| 	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) | 	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) | ||||||
| 	validateTerminatedPodsTrackingFinalizerMetric(t, 5) | 	validateTerminatedPodsTrackingFinalizerMetric(t, 5) | ||||||
| @@ -1208,7 +1835,7 @@ func TestElasticIndexedJob(t *testing.T) { | |||||||
| 					Failed:    update.wantFailed, | 					Failed:    update.wantFailed, | ||||||
| 					Ready:     pointer.Int32(0), | 					Ready:     pointer.Int32(0), | ||||||
| 				}) | 				}) | ||||||
| 				validateIndexedJobPods(ctx, t, clientSet, jobObj, update.wantRemainingIndexes, update.wantSucceededIndexes) | 				validateIndexedJobPods(ctx, t, clientSet, jobObj, update.wantRemainingIndexes, update.wantSucceededIndexes, nil) | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
| 			validateJobSucceeded(ctx, t, clientSet, jobObj) | 			validateJobSucceeded(ctx, t, clientSet, jobObj) | ||||||
| @@ -1424,10 +2051,14 @@ func TestJobPodsCreatedWithExponentialBackoff(t *testing.T) { | |||||||
| 	if len(jobPods) != 3 { | 	if len(jobPods) != 3 { | ||||||
| 		t.Fatalf("Expected to get %v pods, received %v", 3, len(jobPods)) | 		t.Fatalf("Expected to get %v pods, received %v", 3, len(jobPods)) | ||||||
| 	} | 	} | ||||||
|  | 	validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, jobPods) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func validateExpotentialBackoffDelay(t *testing.T, defaultPodFailureBackoff time.Duration, pods []*v1.Pod) { | ||||||
|  | 	t.Helper() | ||||||
| 	creationTime := []time.Time{} | 	creationTime := []time.Time{} | ||||||
| 	finishTime := []time.Time{} | 	finishTime := []time.Time{} | ||||||
| 	for _, pod := range jobPods { | 	for _, pod := range pods { | ||||||
| 		creationTime = append(creationTime, pod.CreationTimestamp.Time) | 		creationTime = append(creationTime, pod.CreationTimestamp.Time) | ||||||
| 		if len(pod.Status.ContainerStatuses) > 0 { | 		if len(pod.Status.ContainerStatuses) > 0 { | ||||||
| 			finishTime = append(finishTime, pod.Status.ContainerStatuses[0].State.Terminated.FinishedAt.Time) | 			finishTime = append(finishTime, pod.Status.ContainerStatuses[0].State.Terminated.FinishedAt.Time) | ||||||
| @@ -1441,25 +2072,24 @@ func TestJobPodsCreatedWithExponentialBackoff(t *testing.T) { | |||||||
| 		return finishTime[i].Before(finishTime[j]) | 		return finishTime[i].Before(finishTime[j]) | ||||||
| 	}) | 	}) | ||||||
|  |  | ||||||
| 	if creationTime[1].Sub(finishTime[0]).Seconds() < jobcontroller.DefaultJobPodFailureBackOff.Seconds() { | 	diff := creationTime[1].Sub(finishTime[0]) | ||||||
| 		t.Fatalf("Second pod should be created at least %v seconds after the first pod", jobcontroller.DefaultJobPodFailureBackOff) |  | ||||||
|  | 	if diff < defaultPodFailureBackoff { | ||||||
|  | 		t.Fatalf("Second pod should be created at least %v seconds after the first pod, time difference: %v", defaultPodFailureBackoff, diff) | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if creationTime[1].Sub(finishTime[0]).Seconds() >= 2*jobcontroller.DefaultJobPodFailureBackOff.Seconds() { | 	if diff >= 2*defaultPodFailureBackoff { | ||||||
| 		t.Fatalf("Second pod should be created before %v seconds after the first pod", 2*jobcontroller.DefaultJobPodFailureBackOff) | 		t.Fatalf("Second pod should be created before %v seconds after the first pod, time difference: %v", 2*defaultPodFailureBackoff, diff) | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	diff := creationTime[2].Sub(finishTime[1]).Seconds() | 	diff = creationTime[2].Sub(finishTime[1]) | ||||||
|  |  | ||||||
| 	// The third pod should not be created before 4 seconds | 	if diff < 2*defaultPodFailureBackoff { | ||||||
| 	if diff < 2*jobcontroller.DefaultJobPodFailureBackOff.Seconds() { | 		t.Fatalf("Third pod should be created at least %v seconds after the second pod, time difference: %v", 2*defaultPodFailureBackoff, diff) | ||||||
| 		t.Fatalf("Third pod should be created at least %v seconds after the second pod", 2*jobcontroller.DefaultJobPodFailureBackOff) |  | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// The third pod should be created within 8 seconds | 	if diff >= 4*defaultPodFailureBackoff { | ||||||
| 	// This check rules out double counting | 		t.Fatalf("Third pod should be created before %v seconds after the second pod, time difference: %v", 4*defaultPodFailureBackoff, diff) | ||||||
| 	if diff >= 4*jobcontroller.DefaultJobPodFailureBackOff.Seconds() { |  | ||||||
| 		t.Fatalf("Third pod should be created before %v seconds after the second pod", 4*jobcontroller.DefaultJobPodFailureBackOff) |  | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
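
As a concrete example of the bounds checked above: with a 2s defaultPodFailureBackoff, the second pod must be created no earlier than 2s and strictly less than 4s after the first pod finished, and the third pod no earlier than 4s and strictly less than 8s after the second; the upper bounds guard against a single failure being counted twice.
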
|  |  | ||||||
| @@ -1815,7 +2445,7 @@ func validateFinishedPodsNoFinalizer(ctx context.Context, t *testing.T, clientSe | |||||||
| // validateIndexedJobPods validates indexes and hostname of | // validateIndexedJobPods validates indexes and hostname of | ||||||
| // active and completed Pods of an Indexed Job. | // active and completed Pods of an Indexed Job. | ||||||
| // Call after validateJobPodsStatus | // Call after validateJobPodsStatus | ||||||
| func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, wantActive sets.Set[int], gotCompleted string) { | func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, wantActive sets.Set[int], gotCompleted string, wantFailed *string) { | ||||||
| 	t.Helper() | 	t.Helper() | ||||||
| 	updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{}) | 	updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{}) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| @@ -1824,6 +2454,9 @@ func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clients | |||||||
| 	if updatedJob.Status.CompletedIndexes != gotCompleted { | 	if updatedJob.Status.CompletedIndexes != gotCompleted { | ||||||
| 		t.Errorf("Got completed indexes %q, want %q", updatedJob.Status.CompletedIndexes, gotCompleted) | 		t.Errorf("Got completed indexes %q, want %q", updatedJob.Status.CompletedIndexes, gotCompleted) | ||||||
| 	} | 	} | ||||||
|  | 	if diff := cmp.Diff(wantFailed, updatedJob.Status.FailedIndexes); diff != "" { | ||||||
|  | 		t.Errorf("Got unexpected failed indexes: %s", diff) | ||||||
|  | 	} | ||||||
| 	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) | 	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		t.Fatalf("Failed to list Job Pods: %v", err) | 		t.Fatalf("Failed to list Job Pods: %v", err) | ||||||
| @@ -2005,6 +2638,17 @@ func setJobPhaseForIndex(ctx context.Context, clientSet clientset.Interface, job | |||||||
| 		} | 		} | ||||||
| 		if pix, err := getCompletionIndex(&pod); err == nil && pix == ix { | 		if pix, err := getCompletionIndex(&pod); err == nil && pix == ix { | ||||||
| 			pod.Status.Phase = phase | 			pod.Status.Phase = phase | ||||||
|  | 			if phase == v1.PodFailed || phase == v1.PodSucceeded { | ||||||
|  | 				pod.Status.ContainerStatuses = []v1.ContainerStatus{ | ||||||
|  | 					{ | ||||||
|  | 						State: v1.ContainerState{ | ||||||
|  | 							Terminated: &v1.ContainerStateTerminated{ | ||||||
|  | 								FinishedAt: metav1.Now(), | ||||||
|  | 							}, | ||||||
|  | 						}, | ||||||
|  | 					}, | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
| 			_, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{}) | 			_, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{}) | ||||||
| 			if err != nil { | 			if err != nil { | ||||||
| 				return fmt.Errorf("updating pod %s status: %w", pod.Name, err) | 				return fmt.Errorf("updating pod %s status: %w", pod.Name, err) | ||||||
| @@ -2015,6 +2659,44 @@ func setJobPhaseForIndex(ctx context.Context, clientSet clientset.Interface, job | |||||||
| 	return errors.New("no pod matching index found") | 	return errors.New("no pod matching index found") | ||||||
| } | } | ||||||
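
The FinishedAt timestamp set above is what the exponential backoff checks read back: the delay is measured from the previous pod's container termination time to the next pod's creation time. A minimal sketch of that lookup, reusing the test file's imports (the helper is illustrative; validateExpotentialBackoffDelay reads the field inline):

// podFinishTime returns the FinishedAt timestamp of the first terminated
// container status, or the zero time if no container has terminated.
func podFinishTime(p *v1.Pod) time.Time {
	for _, cs := range p.Status.ContainerStatuses {
		if cs.State.Terminated != nil {
			return cs.State.Terminated.FinishedAt.Time
		}
	}
	return time.Time{}
}
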
|  |  | ||||||
|  | func getActivePodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int) (*v1.Pod, error) { | ||||||
|  | 	return getJobPodForIndex(ctx, clientSet, jobObj, ix, func(p *v1.Pod) bool { | ||||||
|  | 		return !podutil.IsPodTerminal(p) | ||||||
|  | 	}) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func getJobPodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) (*v1.Pod, error) { | ||||||
|  | 	pods, err := getJobPodsForIndex(ctx, clientSet, jobObj, ix, filter) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return nil, err | ||||||
|  | 	} | ||||||
|  | 	if len(pods) == 0 { | ||||||
|  | 		return nil, fmt.Errorf("pod not found for index: %v", ix) | ||||||
|  | 	} | ||||||
|  | 	return pods[0], nil | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func getJobPodsForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) ([]*v1.Pod, error) { | ||||||
|  | 	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return nil, fmt.Errorf("listing Job Pods: %w", err) | ||||||
|  | 	} | ||||||
|  | 	var result []*v1.Pod | ||||||
|  | 	for _, pod := range pods.Items { | ||||||
|  | 		pod := pod | ||||||
|  | 		if !metav1.IsControlledBy(&pod, jobObj) { | ||||||
|  | 			continue | ||||||
|  | 		} | ||||||
|  | 		if !filter(&pod) { | ||||||
|  | 			continue | ||||||
|  | 		} | ||||||
|  | 		if pix, err := getCompletionIndex(&pod); err == nil && pix == ix { | ||||||
|  | 			result = append(result, &pod) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	return result, nil | ||||||
|  | } | ||||||
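
A usage sketch for these helpers, assuming ctx, clientSet, jobObj, and t from a surrounding test (the failure filter is illustrative; the tests above mostly use getActivePodForIndex or a catch-all filter):

// Illustrative: fetch a failed pod for index 0 of the Job.
failedPod, err := getJobPodForIndex(ctx, clientSet, jobObj, 0, func(p *v1.Pod) bool {
	return p.Status.Phase == v1.PodFailed
})
if err != nil {
	t.Fatalf("Failed to get a failed pod for index 0: %v", err)
}
_ = failedPod
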
|  |  | ||||||
| func getCompletionIndex(p *v1.Pod) (int, error) { | func getCompletionIndex(p *v1.Pod) (int, error) { | ||||||
| 	if p.Annotations == nil { | 	if p.Annotations == nil { | ||||||
| 		return 0, errors.New("no annotations found") | 		return 0, errors.New("no annotations found") | ||||||
|   | |||||||