// kubernetes/test/integration/job/job_test.go
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"context"
"errors"
"fmt"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
eventsv1 "k8s.io/api/events/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
typedv1 "k8s.io/client-go/kubernetes/typed/batch/v1"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/retry"
featuregatetesting "k8s.io/component-base/featuregate/testing"
basemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/testutil"
"k8s.io/klog/v2"
kubeapiservertesting "k8s.io/kubernetes/cmd/kube-apiserver/app/testing"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/controller"
jobcontroller "k8s.io/kubernetes/pkg/controller/job"
"k8s.io/kubernetes/pkg/controller/job/metrics"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/test/integration/framework"
"k8s.io/kubernetes/test/integration/util"
"k8s.io/utils/ptr"
)
const waitInterval = time.Second
const fastPodFailureBackoff = 100 * time.Millisecond
// Time duration used to account for controller latency in tests in which the
// Job controller is expected not to make a change. In such cases we wait a
// little bit (more than the typical time for a couple of controller syncs) and
// verify there is no change.
const sleepDurationForControllerLatency = 100 * time.Millisecond
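// A minimal usage sketch (hedged; jobObj, clientSet, and wantStatus stand in
// for fixtures of a concrete test):
//
//	time.Sleep(sleepDurationForControllerLatency)
//	gotJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
//	if err != nil {
//		t.Fatalf("Failed to get Job: %v", err)
//	}
//	if diff := cmp.Diff(wantStatus, gotJob.Status); diff != "" {
//		t.Errorf("Job status changed unexpectedly (-want,+got):\n%s", diff)
//	}
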
type metricLabelsWithValue struct {
Labels []string
Value int
}
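// validateCounterMetric polls until the counter identified by wantMetric's
// labels reaches the wanted value, and reports a test error on timeout. A
// hedged usage sketch, mirroring the calls later in this file:
//
//	validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
//		Labels: []string{"NonIndexed", "succeeded", "CompletionsReached"},
//		Value:  1,
//	})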
func validateCounterMetric(ctx context.Context, t *testing.T, counterVec *basemetrics.CounterVec, wantMetric metricLabelsWithValue) {
t.Helper()
var cmpErr error
err := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, 10*time.Second, true, func(ctx context.Context) (bool, error) {
cmpErr = nil
value, err := testutil.GetCounterMetricValue(counterVec.WithLabelValues(wantMetric.Labels...))
if err != nil {
return true, fmt.Errorf("collecting the %q metric: %w", counterVec.Name, err)
}
if wantMetric.Value != int(value) {
cmpErr = fmt.Errorf("Unexpected metric delta for %q metric with labels %q. want: %v, got: %v", counterVec.Name, wantMetric.Labels, wantMetric.Value, int(value))
return false, nil
}
return true, nil
})
if err != nil {
t.Errorf("Failed waiting for expected metric: %v", err)
}
if cmpErr != nil {
t.Error(cmpErr)
}
}
func validateTerminatedPodsTrackingFinalizerMetric(ctx context.Context, t *testing.T, want int) {
validateCounterMetric(ctx, t, metrics.TerminatedPodsTrackingFinalizerTotal, metricLabelsWithValue{
Value: want,
Labels: []string{metrics.Add},
})
validateCounterMetric(ctx, t, metrics.TerminatedPodsTrackingFinalizerTotal, metricLabelsWithValue{
Value: want,
Labels: []string{metrics.Delete},
})
}
// TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart verifies that the job is properly marked as Failed
// in a scenario where the job controller crashes between removing pod finalizers and marking the job as Failed (based
// on the pod failure policy). After the finalizer for the failed pod is removed, we delete the failed pod to simulate
// what PodGC would do. Then the test spawns a second instance of the controller to check that it picks up the job
// state properly and marks the job as Failed, even though the pod that triggered the pod failure policy is already
// deleted.
// Note: this scenario requires the use of finalizers. Without finalizers there is no guarantee a failed pod would be
// checked against the pod failure policy rules before its removal by PodGC.
func TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart(t *testing.T) {
count := int32(3)
job := batchv1.Job{
Spec: batchv1.JobSpec{
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
ImagePullPolicy: v1.PullIfNotPresent,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
},
},
Parallelism: &count,
Completions: &count,
PodFailurePolicy: &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5},
},
},
},
},
},
}
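// Pod status matching the FailJob rule above: the main container terminated
// with exit code 5.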
podStatusMatchingOnExitCodesTerminateRule := v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
}
closeFn, restConfig, cs, ns := setup(t, "simple")
defer closeFn()
// Make the job controller significantly slower to trigger the race condition.
restConfig.QPS = 1
restConfig.Burst = 1
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
resetMetrics()
restConfig.QPS = 200
restConfig.Burst = 200
// create a job with a failed pod matching the exit code rule and a couple of successful pods
jobObj, err := createJobWithDefaults(ctx, cs, ns.Name, &job)
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, cs, jobObj, podsByStatus{
Active: int(count),
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
jobPods, err := getJobPods(ctx, t, cs, jobObj, func(s v1.PodStatus) bool {
return (s.Phase == v1.PodPending || s.Phase == v1.PodRunning)
})
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
}
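// Arbitrarily pick the pod at index 1 of the listing as the one to fail.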
failedIndex := 1
wg := sync.WaitGroup{}
wg.Add(1)
// Wait for the failed pod (with index failedIndex) to have its finalizer
// removed. The finalizer will be removed by the job controller just after
// appending the FailureTarget condition to the job to mark it as targeted
// for failure.
go func(ctx context.Context) {
err := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, time.Minute, true, func(ctx context.Context) (bool, error) {
failedPodUpdated, err := cs.CoreV1().Pods(jobObj.Namespace).Get(ctx, jobPods[failedIndex].Name, metav1.GetOptions{})
if err != nil {
return true, err
}
if len(failedPodUpdated.Finalizers) == 0 {
return true, nil
}
return false, nil
})
if err != nil {
t.Logf("Failed awaiting for the finalizer removal for pod %v", klog.KObj(jobPods[failedIndex]))
}
wg.Done()
}(ctx)
// We update one pod as failed with state matching the pod failure policy rule. This results in removal
// of the pod finalizer from the pod by the job controller.
failedPod := jobPods[failedIndex]
updatedPod := failedPod.DeepCopy()
updatedPod.Status = podStatusMatchingOnExitCodesTerminateRule
_, err = updatePodStatuses(ctx, cs, []v1.Pod{*updatedPod})
if err != nil {
t.Fatalf("Failed to update pod statuses %q for pods of job %q", err, klog.KObj(jobObj))
}
wg.Wait()
t.Logf("Finalizer is removed for the failed pod %q. Shutting down the controller.", klog.KObj(failedPod))
// shut down the first job controller as soon as it removed the finalizer for the failed pod. This will
// likely happen before the first controller is able to mark the job as Failed.
cancel()
// Delete the failed pod to make sure it is not used by the second instance of the controller.
// Use a fresh context, as the previous one was cancelled along with the first controller.
ctx, cancel = context.WithCancel(context.Background())
err = cs.CoreV1().Pods(failedPod.Namespace).Delete(ctx, failedPod.Name, metav1.DeleteOptions{GracePeriodSeconds: ptr.To[int64](0)})
if err != nil {
t.Fatalf("Error: '%v' while deleting pod: '%v'", err, klog.KObj(failedPod))
}
t.Logf("The failed pod %q is deleted", klog.KObj(failedPod))
cancel()
// Start the second controller to promote the interim FailureTarget job condition to Failed.
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
// verify the job is correctly marked as Failed
validateJobFailed(ctx, t, cs, jobObj)
validateNoOrphanPodsWithFinalizers(ctx, t, cs, jobObj)
}
// TestJobPodFailurePolicy tests handling of pod failures with respect to the
// configured pod failure policy rules
func TestJobPodFailurePolicy(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
job := batchv1.Job{
Spec: batchv1.JobSpec{
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
ImagePullPolicy: v1.PullIfNotPresent,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
},
},
PodFailurePolicy: &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionIgnore,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.DisruptionTarget,
},
},
},
{
Action: batchv1.PodFailurePolicyActionCount,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{10},
},
},
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5, 6, 7},
},
},
},
},
},
}
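// Matches the FailJob rule above: main-container terminated with exit code 5 (in {5, 6, 7}).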
podStatusMatchingOnExitCodesTerminateRule := v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 5,
},
},
},
},
}
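// Matches the Count rule above: main-container terminated with exit code 10.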
podStatusMatchingOnExitCodesCountRule := v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
Name: "main-container",
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 10,
},
},
},
},
}
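// Matches the Ignore rule above: the pod has a DisruptionTarget condition set to True.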
podStatusMatchingOnPodConditionsIgnoreRule := v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
}
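// Matches no rule: the container terminated with the zero exit code and the
// pod has no DisruptionTarget condition, so the failure is counted by the
// default handling.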
podStatusNotMatchingAnyRule := v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{},
},
},
},
}
testCases := map[string]struct {
restartController bool
job batchv1.Job
podStatus v1.PodStatus
wantActive int
wantFailed int
wantJobConditionType batchv1.JobConditionType
wantJobFinishedMetric metricLabelsWithValue
wantPodFailuresHandledByPolicyRuleMetric *metricLabelsWithValue
}{
"pod status matching the configured FailJob rule on exit codes; job terminated": {
job: job,
podStatus: podStatusMatchingOnExitCodesTerminateRule,
wantActive: 0,
wantFailed: 1,
wantJobConditionType: batchv1.JobFailed,
wantJobFinishedMetric: metricLabelsWithValue{
Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"},
Value: 1,
},
wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
Labels: []string{"FailJob"},
Value: 1,
},
},
"pod status matching the configured FailJob rule on exit codes; with controller restart; job terminated": {
restartController: true,
job: job,
podStatus: podStatusMatchingOnExitCodesTerminateRule,
wantActive: 0,
wantFailed: 1,
wantJobConditionType: batchv1.JobFailed,
wantJobFinishedMetric: metricLabelsWithValue{
Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"},
Value: 1,
},
},
"pod status matching the configured Ignore rule on pod conditions; pod failure not counted": {
job: job,
podStatus: podStatusMatchingOnPodConditionsIgnoreRule,
wantActive: 1,
wantFailed: 0,
wantJobConditionType: batchv1.JobComplete,
wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
Labels: []string{"Ignore"},
Value: 1,
},
wantJobFinishedMetric: metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded", "CompletionsReached"},
Value: 1,
},
},
"pod status matching the configured Count rule on exit codes; pod failure counted": {
job: job,
podStatus: podStatusMatchingOnExitCodesCountRule,
wantActive: 1,
wantFailed: 1,
wantJobConditionType: batchv1.JobComplete,
wantJobFinishedMetric: metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded", "CompletionsReached"},
Value: 1,
},
wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
Labels: []string{"Count"},
Value: 1,
},
},
"pod status non-matching any configured rule; pod failure counted": {
job: job,
podStatus: podStatusNotMatchingAnyRule,
wantActive: 1,
wantFailed: 1,
wantJobConditionType: batchv1.JobComplete,
wantJobFinishedMetric: metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded", "CompletionsReached"},
Value: 1,
},
wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
Labels: []string{"Count"},
Value: 0,
},
},
}
for name, test := range testCases {
t.Run(name, func(t *testing.T) {
resetMetrics()
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
if err != nil {
t.Fatalf("Error %q while creating the job %q", err, jobObj.Name)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
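// op overwrites a pod's status with the fixture for this test case; the
// helper below applies it to one pod of the Job.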
op := func(p *v1.Pod) bool {
p.Status = test.podStatus
return true
}
if _, err := updateJobPodsStatus(ctx, clientSet, jobObj, op, 1); err != nil {
t.Fatalf("Error %q while updating pod status for Job: %v", err, jobObj.Name)
}
if test.restartController {
cancel()
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: test.wantActive,
Failed: test.wantFailed,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
if test.wantJobConditionType == batchv1.JobComplete {
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatalf("Failed setting phase %q on Job Pod: %v", v1.PodSucceeded, err)
}
}
validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
validateCounterMetric(ctx, t, metrics.JobFinishedNum, test.wantJobFinishedMetric)
if test.wantPodFailuresHandledByPolicyRuleMetric != nil {
validateCounterMetric(ctx, t, metrics.PodFailuresHandledByFailurePolicy, *test.wantPodFailuresHandledByPolicyRuleMetric)
}
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
})
}
}
// TestSuccessPolicy tests handling of a job and its pods when
// successPolicy is used.
func TestSuccessPolicy(t *testing.T) {
type podTerminationWithExpectations struct {
index int
status v1.PodStatus
wantActive int
wantFailed int
wantSucceeded int
wantActiveIndexes sets.Set[int]
wantCompletedIndexes string
wantFailedIndexes *string
wantTerminating *int32
}
podTemplateSpec := v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
ImagePullPolicy: v1.PullIfNotPresent,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
},
}
testCases := map[string]struct {
enableJobSuccessPolicy bool
enableBackoffLimitPerIndex bool
job batchv1.Job
podTerminations []podTerminationWithExpectations
wantConditionTypes []batchv1.JobConditionType
wantJobFinishedNumMetric []metricLabelsWithValue
}{
"all indexes succeeded; JobSuccessPolicy is enabled": {
enableJobSuccessPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](1),
Completions: ptr.To[int32](1),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
Template: podTemplateSpec,
SuccessPolicy: &batchv1.SuccessPolicy{
Rules: []batchv1.SuccessPolicyRule{{
SucceededIndexes: ptr.To("0"),
}},
},
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodSucceeded,
},
wantActive: 0,
wantFailed: 0,
wantSucceeded: 1,
wantCompletedIndexes: "0",
wantTerminating: ptr.To[int32](0),
},
},
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", "SuccessPolicy"},
Value: 1,
},
},
},
"all indexes succeeded; JobSuccessPolicy is disabled": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](1),
Completions: ptr.To[int32](1),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
Template: podTemplateSpec,
SuccessPolicy: &batchv1.SuccessPolicy{
Rules: []batchv1.SuccessPolicyRule{{
SucceededIndexes: ptr.To("0"),
}},
},
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodSucceeded,
},
wantActive: 0,
wantFailed: 0,
wantSucceeded: 1,
wantCompletedIndexes: "0",
wantTerminating: ptr.To[int32](0),
},
},
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", ""},
Value: 1,
},
},
},
"job without successPolicy; incremented the jobs_finished_total metric with CompletionsReached reason": {
enableJobSuccessPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](1),
Completions: ptr.To[int32](1),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
Template: podTemplateSpec,
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodSucceeded,
},
wantActive: 0,
wantFailed: 0,
wantSucceeded: 1,
wantCompletedIndexes: "0",
wantTerminating: ptr.To[int32](0),
},
},
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", "CompletionsReached"},
Value: 1,
},
},
},
"job with successPolicy with succeededIndexes; job has SuccessCriteriaMet and Complete conditions even if some indexes remain pending": {
enableJobSuccessPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
Template: podTemplateSpec,
SuccessPolicy: &batchv1.SuccessPolicy{
Rules: []batchv1.SuccessPolicyRule{{
SucceededIndexes: ptr.To("1"),
}},
},
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodPending,
},
wantActive: 2,
wantActiveIndexes: sets.New(0, 1),
wantFailed: 0,
wantSucceeded: 0,
wantTerminating: ptr.To[int32](0),
},
{
index: 1,
status: v1.PodStatus{
Phase: v1.PodSucceeded,
},
wantActive: 0,
wantFailed: 0,
wantSucceeded: 1,
wantCompletedIndexes: "1",
wantTerminating: ptr.To[int32](0),
},
},
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", "SuccessPolicy"},
Value: 1,
},
},
},
"job with successPolicy with succeededCount; job has SuccessCriteriaMet and Complete conditions even if some indexes remain pending": {
enableJobSuccessPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
Template: podTemplateSpec,
SuccessPolicy: &batchv1.SuccessPolicy{
Rules: []batchv1.SuccessPolicyRule{{
SucceededCount: ptr.To[int32](1),
}},
},
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodPending,
},
wantActive: 2,
wantActiveIndexes: sets.New(0, 1),
wantFailed: 0,
wantSucceeded: 0,
wantTerminating: ptr.To[int32](0),
},
{
index: 1,
status: v1.PodStatus{
Phase: v1.PodSucceeded,
},
wantActive: 0,
wantFailed: 0,
wantSucceeded: 1,
wantCompletedIndexes: "1",
wantTerminating: ptr.To[int32](0),
},
},
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobSuccessCriteriaMet, batchv1.JobComplete},
wantJobFinishedNumMetric: []metricLabelsWithValue{
{
Labels: []string{"Indexed", "succeeded", "SuccessPolicy"},
Value: 1,
},
},
},
"job with successPolicy and backoffLimitPerIndex; job has a Failed condition if job meets backoffLimitPerIndex": {
enableJobSuccessPolicy: true,
enableBackoffLimitPerIndex: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
BackoffLimitPerIndex: ptr.To[int32](0),
Template: podTemplateSpec,
SuccessPolicy: &batchv1.SuccessPolicy{
Rules: []batchv1.SuccessPolicyRule{{
SucceededCount: ptr.To[int32](1),
}},
},
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 1,
wantActiveIndexes: sets.New(1),
wantFailed: 1,
wantFailedIndexes: ptr.To("0"),
wantSucceeded: 0,
wantTerminating: ptr.To[int32](0),
},
{
index: 1,
status: v1.PodStatus{
Phase: v1.PodSucceeded,
},
wantActive: 0,
wantFailed: 1,
wantSucceeded: 1,
wantFailedIndexes: ptr.To("0"),
wantCompletedIndexes: "1",
wantTerminating: ptr.To[int32](0),
},
},
wantConditionTypes: []batchv1.JobConditionType{batchv1.JobFailed},
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
resetMetrics()
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobSuccessPolicy, tc.enableJobSuccessPolicy)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableBackoffLimitPerIndex)
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &tc.job)
if err != nil {
t.Fatalf("Error %v while creating the Job %q", err, jobObj.Name)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: int(*tc.job.Spec.Parallelism),
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
for _, podTermination := range tc.podTerminations {
pod, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
if err != nil {
t.Fatalf("Listing Job Pods: %v", err)
}
pod.Status = podTermination.status
if _, err = clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, pod, metav1.UpdateOptions{}); err != nil {
t.Fatalf("Error updating the Pod %q: %v", klog.KObj(pod), err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: podTermination.wantActive,
Succeeded: podTermination.wantSucceeded,
Failed: podTermination.wantFailed,
Ready: ptr.To[int32](0),
Terminating: podTermination.wantTerminating,
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, podTermination.wantActiveIndexes, podTermination.wantCompletedIndexes, podTermination.wantFailedIndexes)
}
for i := range tc.wantConditionTypes {
validateJobCondition(ctx, t, clientSet, jobObj, tc.wantConditionTypes[i])
}
for i := range tc.wantJobFinishedNumMetric {
validateCounterMetric(ctx, t, metrics.JobFinishedNum, tc.wantJobFinishedNumMetric[i])
}
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
})
}
}
// TestSuccessPolicy_ReEnabling tests handling of pod success when
// the JobSuccessPolicy feature is re-enabled.
func TestSuccessPolicy_ReEnabling(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobSuccessPolicy, true)
closeFn, resetConfig, clientSet, ns := setup(t, "success-policy-re-enabling")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, resetConfig)
defer cancel()
resetMetrics()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](5),
Completions: ptr.To[int32](5),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
SuccessPolicy: &batchv1.SuccessPolicy{
Rules: []batchv1.SuccessPolicyRule{{
SucceededCount: ptr.To[int32](3),
}},
},
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 5,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2, 3, 4), "", nil)
// First pod from index 0 succeeded
if err = setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil {
t.Fatalf("Failed tring to succeess pod with index 0")
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 4,
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1, 2, 3, 4), "0", nil)
// Disable the JobSuccessPolicy
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobSuccessPolicy, false)
// First pod from index 1 succeeded
if err = setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatalf("Failed trying to succeess pod with index 1")
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 3,
Succeeded: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(2, 3, 4), "0,1", nil)
// ReEnable the JobSuccessPolicy
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobSuccessPolicy, true)
// First pod from index 2 succeeded
if err = setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil {
t.Fatalf("Failed trying to success pod with index 2")
}
// Verify all indexes are terminated, as the job meets the successPolicy.
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 0,
Succeeded: 3,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](2),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New[int](), "0-2", nil)
validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobSuccessCriteriaMet)
validateJobComplete(ctx, t, clientSet, jobObj)
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
}
// TestBackoffLimitPerIndex_DelayedPodDeletion tests that pod deletion is delayed
// until the replacement pod is created, so that the replacement pod has the
// index-failure-count annotation bumped, when BackoffLimitPerIndex is used.
func TestBackoffLimitPerIndex_DelayedPodDeletion(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)
closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-failed")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](1),
Completions: ptr.To[int32](1),
BackoffLimitPerIndex: ptr.To[int32](1),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", ptr.To(""))
// First pod from index 0 failed.
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
t.Fatal("Failed trying to fail pod with index 0")
}
// Delete the failed pod
pod, err := getJobPodForIndex(ctx, clientSet, jobObj, 0, func(_ *v1.Pod) bool { return true })
if err != nil {
t.Fatalf("failed to get terminal pod for index: %v", 0)
}
if err := clientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
t.Fatalf("failed to delete pod: %v, error: %v", klog.KObj(pod), err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Failed: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", ptr.To(""))
// Verify the replacement pod is created and has the index-failure-count
// annotation bumped.
replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, 0)
if err != nil {
t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", 0, err)
}
gotIndexFailureCount, err := getIndexFailureCount(replacement)
if err != nil {
t.Fatalf("Failed read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err)
}
if diff := cmp.Diff(1, gotIndexFailureCount); diff != "" {
t.Errorf("Unexpected index failure count for the replacement pod: %s", diff)
}
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil {
t.Fatal("Failed trying to fail pod with index 0")
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 0,
Succeeded: 1,
Failed: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateJobComplete(ctx, t, clientSet, jobObj)
}
// TestBackoffLimitPerIndex_Reenabling tests handling of pod failures when
// re-enabling the BackoffLimitPerIndex feature.
func TestBackoffLimitPerIndex_Reenabling(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)
closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-reenabled")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
resetMetrics()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](3),
Completions: ptr.To[int32](3),
BackoffLimitPerIndex: ptr.To[int32](0),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 3,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", ptr.To(""))
// First pod from index 0 failed
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
t.Fatal("Failed trying to fail pod with index 0")
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 2,
Failed: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1, 2), "", ptr.To("0"))
// Disable the feature
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, false)
// First pod from index 1 failed
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
t.Fatal("Failed trying to fail pod with index 1")
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 3,
Failed: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)
// Reenable the feature
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)
// First pod from index 2 failed
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
t.Fatal("Failed trying to fail pod with index 2")
}
// Verify the indexes 0 and 1 are active as the failed pods don't have
// finalizers at this point, so they are ignored.
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 2,
Failed: 3,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To("2"))
// Mark remaining pods as Succeeded and verify Job status.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil {
t.Fatalf("Failed setting phase %q on Job Pod: %v", v1.PodSucceeded, err)
}
validateJobFailed(ctx, t, clientSet, jobObj)
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
}
// TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff tests that the
// pods are recreated with an exponential backoff delay computed independently
// per index. Scenario:
// - fail index 0
// - fail index 0
// - fail index 1
// - succeed index 0
// - fail index 1
// - succeed index 1
func TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, 2*time.Second))
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: ptr.To[int32](2),
Parallelism: ptr.To[int32](2),
BackoffLimitPerIndex: ptr.To[int32](2),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
},
})
if err != nil {
t.Fatalf("Could not create job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To(""))
// Fail the first pod for index 0
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 2,
Failed: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To(""))
// Fail the second pod for index 0
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 2,
Failed: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To(""))
// Fail the first pod for index 1
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 2,
Failed: 3,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To(""))
// Succeed the third pod for index 0
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Failed: 3,
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", ptr.To(""))
// Fail the second pod for index 1
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Failed: 4,
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", ptr.To(""))
// Succeed the third pod for index 1
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 0,
Failed: 4,
Succeeded: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New[int](), "0,1", ptr.To(""))
validateJobComplete(ctx, t, clientSet, jobObj)
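// A hedged sketch of the property asserted by validateExpotentialBackoffDelay
// (defined elsewhere in this file; this is only an illustration, not its
// actual implementation): for each index, the gap between consecutive pod
// creations should grow roughly as backoff * 2^(attempt-1).
//
//	sort.Slice(podsForIndex, func(i, j int) bool {
//		return podsForIndex[i].CreationTimestamp.Time.Before(podsForIndex[j].CreationTimestamp.Time)
//	})
//	for i := 1; i < len(podsForIndex); i++ {
//		gap := podsForIndex[i].CreationTimestamp.Time.Sub(podsForIndex[i-1].CreationTimestamp.Time)
//		if wantMin := jobcontroller.DefaultJobPodFailureBackOff * time.Duration(1<<(i-1)); gap < wantMin {
//			t.Errorf("Pod %d for the index was created %v after its predecessor; want at least %v", i, gap, wantMin)
//		}
//	}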
for index := 0; index < int(*jobObj.Spec.Completions); index++ {
podsForIndex, err := getJobPodsForIndex(ctx, clientSet, jobObj, index, func(_ *v1.Pod) bool { return true })
if err != nil {
t.Fatalf("Failed to list job %q pods for index %v, error: %v", klog.KObj(jobObj), index, err)
}
validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, podsForIndex)
}
}
// TestDelayTerminalPhaseCondition tests the fix making the Job controller delay
// setting the terminal phase conditions (Failed and Complete) until all Pods
// are terminal. Until then, the fate of the Job is indicated by the interim Job
// conditions: FailureTarget or SuccessCriteriaMet.
func TestDelayTerminalPhaseCondition(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
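// The pod template carries a custom finalizer so that deleted Pods linger in a
// non-terminal state. This lets the test observe the interim Job conditions
// before all Pods are terminal; the finalizer is removed in a cleanup step of
// each test case.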
podTemplateSpec := v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"fake.example.com/blockDeletion"},
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
ImagePullPolicy: v1.PullIfNotPresent,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
},
}
failOnePod := func(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job) {
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
t.Fatalf("Failed setting phase %q on Job Pod: %v", v1.PodFailed, err)
}
}
succeedOnePodAndScaleDown := func(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job) {
// mark one pod as succeeded
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil {
t.Fatalf("Failed setting phase %q on Job Pod: %v", v1.PodSucceeded, err)
}
jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
if _, err := updateJob(ctx, jobClient, jobObj.Name, func(j *batchv1.Job) {
j.Spec.Parallelism = ptr.To[int32](1)
j.Spec.Completions = ptr.To[int32](1)
}); err != nil {
t.Fatalf("Unexpected error when scaling down the job: %v", err)
}
}
testCases := map[string]struct {
enableJobManagedBy bool
enableJobPodReplacementPolicy bool
enableJobSuccessPolicy bool
job batchv1.Job
action func(context.Context, clientset.Interface, *batchv1.Job)
wantInterimStatus *batchv1.JobStatus
wantTerminalStatus batchv1.JobStatus
}{
"job backoff limit exceeded; JobPodReplacementPolicy and JobManagedBy disabled": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
Template: podTemplateSpec,
BackoffLimit: ptr.To[int32](0),
},
},
action: failOnePod,
wantTerminalStatus: batchv1.JobStatus{
Failed: 2,
Ready: ptr.To[int32](0),
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobFailed,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonBackoffLimitExceeded,
},
},
},
},
"job backoff limit exceeded; JobPodReplacementPolicy enabled": {
enableJobPodReplacementPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
Template: podTemplateSpec,
BackoffLimit: ptr.To[int32](0),
},
},
action: failOnePod,
wantInterimStatus: &batchv1.JobStatus{
Failed: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](1),
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonBackoffLimitExceeded,
},
},
},
wantTerminalStatus: batchv1.JobStatus{
Failed: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonBackoffLimitExceeded,
},
{
Type: batchv1.JobFailed,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonBackoffLimitExceeded,
},
},
},
},
"job backoff limit exceeded; JobManagedBy enabled": {
enableJobManagedBy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
Template: podTemplateSpec,
BackoffLimit: ptr.To[int32](0),
},
},
action: failOnePod,
wantInterimStatus: &batchv1.JobStatus{
Failed: 2,
Ready: ptr.To[int32](0),
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonBackoffLimitExceeded,
},
},
},
wantTerminalStatus: batchv1.JobStatus{
Failed: 2,
Ready: ptr.To[int32](0),
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonBackoffLimitExceeded,
},
{
Type: batchv1.JobFailed,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonBackoffLimitExceeded,
},
},
},
},
"job scale down to meet completions; JobPodReplacementPolicy and JobManagedBy disabled": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: ptr.To(batchv1.IndexedCompletion),
Template: podTemplateSpec,
},
},
action: succeedOnePodAndScaleDown,
wantTerminalStatus: batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobComplete,
Status: v1.ConditionTrue,
},
},
},
},
"job scale down to meet completions; JobPodReplacementPolicy enabled": {
enableJobPodReplacementPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: ptr.To(batchv1.IndexedCompletion),
Template: podTemplateSpec,
},
},
action: succeedOnePodAndScaleDown,
wantInterimStatus: &batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](1),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
},
},
},
wantTerminalStatus: batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
},
{
Type: batchv1.JobComplete,
Status: v1.ConditionTrue,
},
},
},
},
"job scale down to meet completions; JobManagedBy enabled": {
enableJobManagedBy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: ptr.To(batchv1.IndexedCompletion),
Template: podTemplateSpec,
},
},
action: succeedOnePodAndScaleDown,
wantInterimStatus: &batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
},
},
},
wantTerminalStatus: batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
},
{
Type: batchv1.JobComplete,
Status: v1.ConditionTrue,
},
},
},
},
"job scale down to meet completions; JobManagedBy and JobSuccessPolicy are enabled": {
enableJobManagedBy: true,
enableJobSuccessPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: ptr.To(batchv1.IndexedCompletion),
Template: podTemplateSpec,
},
},
action: succeedOnePodAndScaleDown,
wantInterimStatus: &batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonCompletionsReached,
},
},
},
wantTerminalStatus: batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonCompletionsReached,
},
{
Type: batchv1.JobComplete,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonCompletionsReached,
},
},
},
},
"job scale down to meet completions; JobPodReplacementPolicy and JobSuccessPolicy are enabled": {
enableJobPodReplacementPolicy: true,
enableJobSuccessPolicy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: ptr.To(batchv1.IndexedCompletion),
Template: podTemplateSpec,
},
},
action: succeedOnePodAndScaleDown,
wantInterimStatus: &batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](1),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonCompletionsReached,
},
},
},
wantTerminalStatus: batchv1.JobStatus{
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
CompletedIndexes: "0",
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonCompletionsReached,
},
{
Type: batchv1.JobComplete,
Status: v1.ConditionTrue,
Reason: batchv1.JobReasonCompletionsReached,
},
},
},
},
}
for name, test := range testCases {
t.Run(name, func(t *testing.T) {
resetMetrics()
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, test.enableJobPodReplacementPolicy)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobManagedBy, test.enableJobManagedBy)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.ElasticIndexedJob, true)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobSuccessPolicy, test.enableJobSuccessPolicy)
closeFn, restConfig, clientSet, ns := setup(t, "delay-terminal-condition")
t.Cleanup(closeFn)
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
t.Cleanup(cancel)
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
if err != nil {
t.Fatalf("Error %q while creating the job %q", err, jobObj.Name)
}
t.Cleanup(func() { removePodsFinalizer(ctx, t, clientSet, ns.Name) })
jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
waitForPodsToBeActive(ctx, t, jobClient, *jobObj.Spec.Parallelism, jobObj)
test.action(ctx, clientSet, jobObj)
if test.wantInterimStatus != nil {
validateJobStatus(ctx, t, clientSet, jobObj, *test.wantInterimStatus)
// Set a terminal phase on all the remaining pods to simulate
// the kubelet (or other components, like PodGC).
jobPods, err := getJobPods(ctx, t, clientSet, jobObj, func(s v1.PodStatus) bool {
return (s.Phase == v1.PodPending || s.Phase == v1.PodRunning)
})
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
}
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, len(jobPods)); err != nil {
t.Fatalf("Failed setting phase %q on Job Pod: %v", v1.PodSucceeded, err)
}
}
validateJobStatus(ctx, t, clientSet, jobObj, test.wantTerminalStatus)
})
}
}
// TestBackoffLimitPerIndex tests handling of a job and its pods when
// backoff limit per index is used.
func TestBackoffLimitPerIndex(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
type podTerminationWithExpectations struct {
index int
status v1.PodStatus
wantActive int
wantFailed int
wantSucceeded int
wantActiveIndexes sets.Set[int]
wantCompletedIndexes string
wantFailedIndexes *string
wantReplacementPodFailureCount *int
wantTerminating *int32
}
podTemplateSpec := v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
ImagePullPolicy: v1.PullIfNotPresent,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
},
}
testCases := map[string]struct {
job batchv1.Job
podTerminations []podTerminationWithExpectations
wantJobConditionType batchv1.JobConditionType
wantJobFinishedIndexesTotalMetric []metricLabelsWithValue
}{
"job succeeded": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
BackoffLimitPerIndex: ptr.To[int32](1),
Template: podTemplateSpec,
},
},
podTerminations: []podTerminationWithExpectations{
{
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 2,
wantFailed: 1,
wantActiveIndexes: sets.New(0, 1),
wantFailedIndexes: ptr.To(""),
wantReplacementPodFailureCount: ptr.To(1),
wantTerminating: ptr.To[int32](0),
},
},
wantJobConditionType: batchv1.JobComplete,
wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
{
Labels: []string{"succeeded", "perIndex"},
Value: 2,
},
},
},
"job index fails due to exceeding backoff limit per index": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
BackoffLimitPerIndex: ptr.To[int32](2),
Template: podTemplateSpec,
},
},
podTerminations: []podTerminationWithExpectations{
{
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 2,
wantFailed: 1,
wantActiveIndexes: sets.New(0, 1),
wantFailedIndexes: ptr.To(""),
wantReplacementPodFailureCount: ptr.To(1),
wantTerminating: ptr.To[int32](0),
},
{
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 2,
wantFailed: 2,
wantActiveIndexes: sets.New(0, 1),
wantFailedIndexes: ptr.To(""),
wantReplacementPodFailureCount: ptr.To(2),
wantTerminating: ptr.To[int32](0),
},
{
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 1,
wantFailed: 3,
wantActiveIndexes: sets.New(1),
wantFailedIndexes: ptr.To("0"),
wantTerminating: ptr.To[int32](0),
},
},
wantJobConditionType: batchv1.JobFailed,
wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
{
Labels: []string{"failed", "perIndex"},
Value: 1,
},
{
Labels: []string{"succeeded", "perIndex"},
Value: 1,
},
},
},
"job index fails due to exceeding the global backoff limit first": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](3),
Completions: ptr.To[int32](3),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
BackoffLimitPerIndex: ptr.To[int32](1),
BackoffLimit: ptr.To[int32](2),
Template: podTemplateSpec,
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 3,
wantFailed: 1,
wantActiveIndexes: sets.New(0, 1, 2),
wantFailedIndexes: ptr.To(""),
wantTerminating: ptr.To[int32](0),
},
{
index: 1,
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 3,
wantFailed: 2,
wantActiveIndexes: sets.New(0, 1, 2),
wantFailedIndexes: ptr.To(""),
wantTerminating: ptr.To[int32](0),
},
{
index: 2,
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantFailed: 5,
wantFailedIndexes: ptr.To(""),
wantTerminating: ptr.To[int32](0),
},
},
wantJobConditionType: batchv1.JobFailed,
wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
{
Labels: []string{"succeeded", "perIndex"},
Value: 0,
},
{
Labels: []string{"failed", "perIndex"},
Value: 0,
},
},
},
"job continues execution after a failed index, the job is marked Failed due to the failed index": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
BackoffLimitPerIndex: ptr.To[int32](0),
Template: podTemplateSpec,
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 1,
wantFailed: 1,
wantActiveIndexes: sets.New(1),
wantFailedIndexes: ptr.To("0"),
wantTerminating: ptr.To[int32](0),
},
{
index: 1,
status: v1.PodStatus{
Phase: v1.PodSucceeded,
},
wantFailed: 1,
wantSucceeded: 1,
wantFailedIndexes: ptr.To("0"),
wantCompletedIndexes: "1",
wantTerminating: ptr.To[int32](0),
},
},
wantJobConditionType: batchv1.JobFailed,
wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
{
Labels: []string{"succeeded", "perIndex"},
Value: 1,
},
{
Labels: []string{"failed", "perIndex"},
Value: 1,
},
},
},
"job execution terminated early due to exceeding max failed indexes": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](3),
Completions: ptr.To[int32](3),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
BackoffLimitPerIndex: ptr.To[int32](0),
MaxFailedIndexes: ptr.To[int32](1),
Template: podTemplateSpec,
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 2,
wantFailed: 1,
wantActiveIndexes: sets.New(1, 2),
wantFailedIndexes: ptr.To("0"),
wantTerminating: ptr.To[int32](0),
},
{
index: 1,
status: v1.PodStatus{
Phase: v1.PodFailed,
},
wantActive: 0,
wantFailed: 3,
wantFailedIndexes: ptr.To("0,1"),
wantTerminating: ptr.To[int32](0),
},
},
wantJobConditionType: batchv1.JobFailed,
wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
{
Labels: []string{"failed", "perIndex"},
Value: 2,
},
},
},
"pod failure matching pod failure policy rule with FailIndex action": {
job: batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
BackoffLimitPerIndex: ptr.To[int32](1),
Template: podTemplateSpec,
PodFailurePolicy: &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionFailIndex,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{13},
},
},
{
Action: batchv1.PodFailurePolicyActionFailIndex,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
podTerminations: []podTerminationWithExpectations{
{
index: 0,
status: v1.PodStatus{
Phase: v1.PodFailed,
ContainerStatuses: []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
ExitCode: 13,
},
},
},
},
},
wantActive: 1,
wantFailed: 1,
wantActiveIndexes: sets.New(1),
wantFailedIndexes: ptr.To("0"),
wantTerminating: ptr.To[int32](0),
},
{
index: 1,
status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
wantFailed: 2,
wantFailedIndexes: ptr.To("0,1"),
wantTerminating: ptr.To[int32](0),
},
},
wantJobConditionType: batchv1.JobFailed,
wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
{
Labels: []string{"failed", "perIndex"},
Value: 2,
},
},
},
}
for name, test := range testCases {
t.Run(name, func(t *testing.T) {
resetMetrics()
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
if err != nil {
t.Fatalf("Error %q while creating the job %q", err, jobObj.Name)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: int(*test.job.Spec.Parallelism),
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
for _, podTermination := range test.podTerminations {
pod, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
if err != nil {
t.Fatalf("listing Job Pods: %v", err)
}
pod.Status = podTermination.status
if _, err = clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, pod, metav1.UpdateOptions{}); err != nil {
t.Fatalf("Error updating the pod %q: %v", klog.KObj(pod), err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: podTermination.wantActive,
Succeeded: podTermination.wantSucceeded,
Failed: podTermination.wantFailed,
Ready: ptr.To[int32](0),
Terminating: podTermination.wantTerminating,
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, podTermination.wantActiveIndexes, podTermination.wantCompletedIndexes, podTermination.wantFailedIndexes)
if podTermination.wantReplacementPodFailureCount != nil {
replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
if err != nil {
t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", podTermination.index, err)
}
gotReplacementPodFailureCount, err := getIndexFailureCount(replacement)
if err != nil {
t.Fatalf("Failed read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err)
}
if *podTermination.wantReplacementPodFailureCount != gotReplacementPodFailureCount {
t.Fatalf("Unexpected value of the index failure count annotation. Want: %v, got: %v", *podTermination.wantReplacementPodFailureCount, gotReplacementPodFailureCount)
}
}
}
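// Succeed any pods that are still active so the Job can reach its terminal
// condition.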
remainingActive := test.podTerminations[len(test.podTerminations)-1].wantActive
if remainingActive > 0 {
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remainingActive); err != nil {
t.Fatalf("Failed setting phase %q on Job Pod: %v", v1.PodSucceeded, err)
}
}
validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
for _, wantMetricValue := range test.wantJobFinishedIndexesTotalMetric {
validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, wantMetricValue)
}
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
})
}
}
// TestManagedBy verifies the Job controller correctly makes a decision to
// reconcile or skip reconciliation of the Job depending on the Job's managedBy
// field, and the enablement of the JobManagedBy feature gate.
func TestManagedBy(t *testing.T) {
customControllerName := "example.com/custom-job-controller"
podTemplateSpec := v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
},
},
},
}
testCases := map[string]struct {
enableJobManagedBy bool
job batchv1.Job
wantReconciledByBuiltInController bool
wantJobByExternalControllerTotalMetric metricLabelsWithValue
}{
"the Job controller reconciles jobs without the managedBy": {
enableJobManagedBy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Template: podTemplateSpec,
},
},
wantReconciledByBuiltInController: true,
wantJobByExternalControllerTotalMetric: metricLabelsWithValue{
				// There is no good label value choice to check here, since the
				// value wasn't specified. Let's go with checking for the reserved
				// value just so that all test cases verify the metric.
Labels: []string{batchv1.JobControllerName},
Value: 0,
},
},
"the Job controller reconciles jobs with the well known value of the managedBy field": {
enableJobManagedBy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Template: podTemplateSpec,
ManagedBy: ptr.To(batchv1.JobControllerName),
},
},
wantReconciledByBuiltInController: true,
wantJobByExternalControllerTotalMetric: metricLabelsWithValue{
Labels: []string{batchv1.JobControllerName},
Value: 0,
},
},
"the Job controller reconciles an unsuspended with the custom value of managedBy; feature disabled": {
enableJobManagedBy: false,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Template: podTemplateSpec,
ManagedBy: ptr.To(customControllerName),
},
},
wantReconciledByBuiltInController: true,
wantJobByExternalControllerTotalMetric: metricLabelsWithValue{
Labels: []string{customControllerName},
Value: 0,
},
},
"the Job controller does not reconcile an unsuspended with the custom value of managedBy": {
enableJobManagedBy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Suspend: ptr.To(false),
Template: podTemplateSpec,
ManagedBy: ptr.To(customControllerName),
},
},
wantReconciledByBuiltInController: false,
wantJobByExternalControllerTotalMetric: metricLabelsWithValue{
Labels: []string{customControllerName},
Value: 1,
},
},
"the Job controller does not reconcile a suspended with the custom value of managedBy": {
enableJobManagedBy: true,
job: batchv1.Job{
Spec: batchv1.JobSpec{
Suspend: ptr.To(true),
Template: podTemplateSpec,
ManagedBy: ptr.To(customControllerName),
},
},
wantReconciledByBuiltInController: false,
wantJobByExternalControllerTotalMetric: metricLabelsWithValue{
Labels: []string{customControllerName},
Value: 1,
},
},
}
for name, test := range testCases {
t.Run(name, func(t *testing.T) {
resetMetrics()
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobManagedBy, test.enableJobManagedBy)
closeFn, restConfig, clientSet, ns := setup(t, "managed-by")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
if err != nil {
t.Fatalf("Error %v while creating the job %q", err, klog.KObj(jobObj))
}
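			// When the built-in controller reconciles the Job, pods get created;
			// otherwise the skip metric is incremented and the Job status must
			// remain empty.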
if test.wantReconciledByBuiltInController {
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: int(*jobObj.Spec.Parallelism),
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateCounterMetric(ctx, t, metrics.JobByExternalControllerTotal, test.wantJobByExternalControllerTotalMetric)
} else {
validateCounterMetric(ctx, t, metrics.JobByExternalControllerTotal, test.wantJobByExternalControllerTotalMetric)
time.Sleep(sleepDurationForControllerLatency)
jobObj, err = clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Error %v when getting the latest job %v", err, klog.KObj(jobObj))
}
if diff := cmp.Diff(batchv1.JobStatus{}, jobObj.Status); diff != "" {
t.Fatalf("Unexpected status (-want/+got): %s", diff)
}
}
})
}
}
// TestManagedBy_Reenabling verifies handling of a Job with a custom value of
// the managedBy field by the Job controller, as the JobManagedBy feature gate
// is disabled and re-enabled again. First, when the feature gate is enabled,
// the synchronization is skipped; when it is disabled, the synchronization
// starts; and it is skipped again once the feature gate is re-enabled.
func TestManagedBy_Reenabling(t *testing.T) {
customControllerName := "example.com/custom-job-controller"
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobManagedBy, true)
closeFn, restConfig, clientSet, ns := setup(t, "managed-by-reenabling")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
resetMetrics()
baseJob := batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: "custom-job-test",
Namespace: ns.Name,
},
Spec: batchv1.JobSpec{
Completions: ptr.To[int32](1),
Parallelism: ptr.To[int32](1),
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
},
},
},
},
ManagedBy: &customControllerName,
},
}
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &baseJob)
if err != nil {
t.Fatalf("Error %v when creating the job %q", err, klog.KObj(jobObj))
}
jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
validateCounterMetric(ctx, t, metrics.JobByExternalControllerTotal, metricLabelsWithValue{
Labels: []string{customControllerName},
Value: 1,
})
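	// Wait a little and verify the Job status remains empty, since status
	// updates are owned by the custom controller indicated by managedBy.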
time.Sleep(sleepDurationForControllerLatency)
jobObj, err = jobClient.Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Error %v when getting the latest job %v", err, klog.KObj(jobObj))
}
if diff := cmp.Diff(batchv1.JobStatus{}, jobObj.Status); diff != "" {
t.Fatalf("Unexpected status (-want/+got): %s", diff)
}
// Disable the feature gate and restart the controller
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobManagedBy, false)
cancel()
resetMetrics()
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
// Verify the built-in controller reconciles the Job
validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateCounterMetric(ctx, t, metrics.JobByExternalControllerTotal, metricLabelsWithValue{
Labels: []string{customControllerName},
Value: 0,
})
// Reenable the feature gate and restart the controller
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobManagedBy, true)
cancel()
resetMetrics()
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
	// Marking the pod as finished does not result in an update of the Job status.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatalf("Error %v when setting phase %s on the pod of job %v", err, v1.PodSucceeded, klog.KObj(jobObj))
}
validateCounterMetric(ctx, t, metrics.JobByExternalControllerTotal, metricLabelsWithValue{
Labels: []string{customControllerName},
Value: 1,
})
time.Sleep(sleepDurationForControllerLatency)
validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
}
// TestManagedBy_RecreatedJob verifies that the Job controller skips
// reconciliation of a job with the managedBy field when the job is recreated,
// and there is still a pending sync queued for the previous job.
// In this scenario we first create a job without the managedBy field, and we
// mark its pod as succeeded. This queues the Job object sync with a 1s delay.
// Then, without waiting for the Job status update, we delete and recreate the
// job under the same name, but with the managedBy field. The queued update
// starts to execute on the new job, but is skipped.
func TestManagedBy_RecreatedJob(t *testing.T) {
customControllerName := "example.com/custom-job-controller"
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobManagedBy, true)
closeFn, restConfig, clientSet, ns := setup(t, "managed-by-recreate-job")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
resetMetrics()
baseJob := batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: "custom-job-test",
Namespace: ns.Name,
},
Spec: batchv1.JobSpec{
Completions: ptr.To[int32](1),
Parallelism: ptr.To[int32](1),
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
},
},
},
},
},
}
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &baseJob)
if err != nil {
t.Fatalf("Error %v when creating the job %q", err, klog.KObj(jobObj))
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
// Marking the pod as complete queues the job reconciliation
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatalf("Error %v when setting phase %s on the pod of job %v", err, v1.PodSucceeded, klog.KObj(jobObj))
}
jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
if err = jobClient.Delete(ctx, jobObj.Name, metav1.DeleteOptions{
// Use propagationPolicy=background so that we don't need to wait for the job object to be gone.
PropagationPolicy: ptr.To(metav1.DeletePropagationBackground),
}); err != nil {
t.Fatalf("Error %v when deleting the job %v", err, klog.KObj(jobObj))
}
jobWithManagedBy := baseJob.DeepCopy()
jobWithManagedBy.Spec.ManagedBy = ptr.To(customControllerName)
jobObj, err = createJobWithDefaults(ctx, clientSet, ns.Name, jobWithManagedBy)
if err != nil {
t.Fatalf("Error %q while creating the job %q", err, klog.KObj(jobObj))
}
validateCounterMetric(ctx, t, metrics.JobByExternalControllerTotal, metricLabelsWithValue{
Labels: []string{customControllerName},
Value: 1,
})
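	// Give the sync queued for the previous job a chance to run, then verify
	// it did not touch the recreated job's status.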
time.Sleep(sleepDurationForControllerLatency)
jobObj, err = jobClient.Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Error %v when getting the latest job %v", err, klog.KObj(jobObj))
}
if diff := cmp.Diff(batchv1.JobStatus{}, jobObj.Status); diff != "" {
t.Fatalf("Unexpected status (-want/+got): %s", diff)
}
}
// TestManagedBy_UsingReservedJobFinalizers documents the behavior of the Job
// controller when there is a job with a custom value of the managedBy field,
// and its pods are created with the batch.kubernetes.io/job-tracking finalizer.
// The built-in controller should not remove the finalizer. Note that the use
// of the finalizer in jobs managed by external controllers is discouraged, but
// may potentially happen when one forks the controller and does not rename the
// finalizer.
func TestManagedBy_UsingReservedJobFinalizers(t *testing.T) {
customControllerName := "example.com/custom-job-controller"
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobManagedBy, true)
closeFn, restConfig, clientSet, ns := setup(t, "managed-by-reserved-finalizers")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
resetMetrics()
jobSpec := batchv1.Job{
TypeMeta: metav1.TypeMeta{
APIVersion: "batch/v1",
Kind: "Job",
},
ObjectMeta: metav1.ObjectMeta{
Name: "custom-job-test",
Namespace: ns.Name,
},
Spec: batchv1.JobSpec{
Completions: ptr.To[int32](1),
Parallelism: ptr.To[int32](1),
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "main-container",
Image: "foo",
},
},
},
},
ManagedBy: ptr.To(customControllerName),
},
}
// Create a job with custom managedBy
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &jobSpec)
if err != nil {
t.Fatalf("Error %v when creating the job %q", err, klog.KObj(jobObj))
}
podControl := controller.RealPodControl{
KubeClient: clientSet,
Recorder: &record.FakeRecorder{},
}
// Create the pod manually simulating the behavior of the external controller
// indicated by the managedBy field. We create the pod with the built-in
// finalizer.
podTemplate := jobObj.Spec.Template.DeepCopy()
podTemplate.Finalizers = append(podTemplate.Finalizers, batchv1.JobTrackingFinalizer)
err = podControl.CreatePodsWithGenerateName(ctx, jobObj.Namespace, podTemplate, jobObj, metav1.NewControllerRef(jobObj, batchv1.SchemeGroupVersion.WithKind("Job")), "pod1")
if err != nil {
t.Fatalf("Error %v when creating a pod for job %q", err, klog.KObj(jobObj))
}
	// Get the list of pods for the Job to obtain a reference to the created pod.
jobPods, err := getJobPods(ctx, t, clientSet, jobObj, func(ps v1.PodStatus) bool { return true })
if err != nil {
t.Fatalf("Error %v getting the list of pods for job %q", err, klog.KObj(jobObj))
}
if len(jobPods) != 1 {
t.Fatalf("Unexpected number (%d) of pods for job: %v", len(jobPods), klog.KObj(jobObj))
}
// Marking the pod as finished (succeeded), before marking the parent job as complete.
podObj := jobPods[0]
podObj.Status.Phase = v1.PodSucceeded
podObj, err = clientSet.CoreV1().Pods(ns.Name).UpdateStatus(ctx, podObj, metav1.UpdateOptions{})
if err != nil {
t.Fatalf("Error %v when marking the %q pod as succeeded", err, klog.KObj(podObj))
}
	// Trigger termination for the Job so that the built-in controller receives
	// the UpdateJob event, in reaction to which it would remove the pod's
	// finalizer, if not for the custom managedBy field.
jobObj.Status.Conditions = append(jobObj.Status.Conditions, batchv1.JobCondition{
Type: batchv1.JobSuccessCriteriaMet,
Status: v1.ConditionTrue,
})
jobObj.Status.StartTime = ptr.To(metav1.Now())
if jobObj, err = clientSet.BatchV1().Jobs(jobObj.Namespace).UpdateStatus(ctx, jobObj, metav1.UpdateOptions{}); err != nil {
t.Fatalf("Error %v when updating the job as finished %v", err, klog.KObj(jobObj))
}
podObj, err = clientSet.CoreV1().Pods(ns.Name).Get(ctx, podObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Error %v when getting the latest version of the pod %v", err, klog.KObj(podObj))
}
	// Update the pod so that the built-in controller receives the UpdatePod
	// event, in reaction to which it would remove the pod's finalizer, if not
	// for the custom value of the managedBy field on the job.
podObj.Status.Conditions = append(podObj.Status.Conditions, v1.PodCondition{
Type: v1.PodConditionType("CustomCondition"),
Status: v1.ConditionTrue,
})
podObj, err = clientSet.CoreV1().Pods(ns.Name).UpdateStatus(ctx, podObj, metav1.UpdateOptions{})
if err != nil {
t.Fatalf("Error %v when adding a condition to the pod status %v", err, klog.KObj(podObj))
}
time.Sleep(sleepDurationForControllerLatency)
podObj, err = clientSet.CoreV1().Pods(ns.Name).Get(ctx, podObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Error %v when getting the latest version of the pod %v", err, klog.KObj(podObj))
}
if diff := cmp.Diff([]string{batchv1.JobTrackingFinalizer}, podObj.Finalizers); diff != "" {
t.Fatalf("Unexpected change in the set of finalizers for pod %q, because the owner job %q has custom managedBy, diff=%s", klog.KObj(podObj), klog.KObj(jobObj), diff)
}
}
func getIndexFailureCount(p *v1.Pod) (int, error) {
if p.Annotations == nil {
return 0, errors.New("no annotations found")
}
v, ok := p.Annotations[batchv1.JobIndexFailureCountAnnotation]
if !ok {
return 0, fmt.Errorf("annotation %s not found", batchv1.JobIndexFailureCountAnnotation)
}
return strconv.Atoi(v)
}
func completionModePtr(cm batchv1.CompletionMode) *batchv1.CompletionMode {
return &cm
}
// TestNonParallelJob tests a Job that only executes one Pod. The test
// recreates the Job controller at some points to make sure a new controller
// is able to pick up where the previous one left off.
func TestNonParallelJob(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
resetMetrics()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
// Restarting controller.
cancel()
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
// Failed Pod is replaced.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Failed: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
Labels: []string{"NonIndexed", "failed"},
Value: 1,
})
// Restarting controller.
cancel()
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
// No more Pods are created after the Pod succeeds.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
}
validateJobComplete(ctx, t, clientSet, jobObj)
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Failed: 1,
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded", "CompletionsReached"},
Value: 1,
})
validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded"},
Value: 1,
})
}
func TestParallelJob(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
closeFn, restConfig, clientSet, ns := setup(t, "parallel")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
resetMetrics()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](5),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
want := podsByStatus{
Active: 5,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
// Tracks ready pods, if enabled.
if _, err := setJobPodsReady(ctx, clientSet, jobObj, 2); err != nil {
t.Fatalf("Failed Marking Pods as ready: %v", err)
}
want.Ready = ptr.To[int32](2)
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
// Failed Pods are replaced.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err)
}
want = podsByStatus{
Active: 5,
Failed: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
// Once one Pod succeeds, no more Pods are created, even if some fail.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
}
want = podsByStatus{
Failed: 2,
Succeeded: 1,
Active: 4,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err)
}
want = podsByStatus{
Failed: 4,
Succeeded: 1,
Active: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
// No more Pods are created after remaining Pods succeed.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil {
t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodSucceeded, err)
}
validateJobComplete(ctx, t, clientSet, jobObj)
want = podsByStatus{
Failed: 4,
Succeeded: 3,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 7)
validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded", "CompletionsReached"},
Value: 1,
})
validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded"},
Value: 3,
})
validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
Labels: []string{"NonIndexed", "failed"},
Value: 4,
})
}
func TestParallelJobChangingParallelism(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "parallel")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
BackoffLimit: ptr.To[int32](2),
Parallelism: ptr.To[int32](5),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 5,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
// Reduce parallelism by a number greater than backoffLimit.
patch := []byte(`{"spec":{"parallelism":2}}`)
jobObj, err = clientSet.BatchV1().Jobs(ns.Name).Patch(ctx, jobObj.Name, types.StrategicMergePatchType, patch, metav1.PatchOptions{})
if err != nil {
t.Fatalf("Updating Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
// Increase parallelism again.
patch = []byte(`{"spec":{"parallelism":4}}`)
jobObj, err = clientSet.BatchV1().Jobs(ns.Name).Patch(ctx, jobObj.Name, types.StrategicMergePatchType, patch, metav1.PatchOptions{})
if err != nil {
t.Fatalf("Updating Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 4,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
// Succeed Job
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 4); err != nil {
t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodSucceeded, err)
}
validateJobComplete(ctx, t, clientSet, jobObj)
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Succeeded: 4,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
}
func TestParallelJobWithCompletions(t *testing.T) {
// Lower limits for a job sync so that we can test partial updates with a low
// number of pods.
t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 10))
t.Cleanup(setDuringTest(&jobcontroller.MaxPodCreateDeletePerSync, 10))
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
closeFn, restConfig, clientSet, ns := setup(t, "completions")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
resetMetrics()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](54),
Completions: ptr.To[int32](56),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
want := podsByStatus{
Active: 54,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
// Tracks ready pods, if enabled.
if _, err := setJobPodsReady(ctx, clientSet, jobObj, 52); err != nil {
t.Fatalf("Failed Marking Pods as ready: %v", err)
}
want.Ready = ptr.To[int32](52)
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
// Failed Pods are replaced.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err)
}
want = podsByStatus{
Active: 54,
Failed: 2,
Ready: ptr.To[int32](50),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
// Pods are created until the number of succeeded Pods equals completions.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 53); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
}
want = podsByStatus{
Failed: 2,
Succeeded: 53,
Active: 3,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
// No more Pods are created after the Job completes.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil {
t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodSucceeded, err)
}
validateJobComplete(ctx, t, clientSet, jobObj)
want = podsByStatus{
Failed: 2,
Succeeded: 56,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded", "CompletionsReached"},
Value: 1,
})
validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
Labels: []string{"NonIndexed", "succeeded"},
Value: 56,
})
validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
Labels: []string{"NonIndexed", "failed"},
Value: 2,
})
}
func TestIndexedJob(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
closeFn, restConfig, clientSet, ns := setup(t, "indexed")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
resetMetrics()
mode := batchv1.IndexedCompletion
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](3),
Completions: ptr.To[int32](4),
CompletionMode: &mode,
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 3,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)
validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
Labels: []string{"succeeded", "global"},
Value: 0,
})
// One Pod succeeds.
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
t.Fatal("Failed trying to succeed pod with index 1")
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 3,
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
Labels: []string{"succeeded", "global"},
Value: 1,
})
	// One Pod fails; it should be recreated.
	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
		t.Fatal("Failed trying to fail pod with index 2")
	}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 3,
Failed: 1,
Succeeded: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
Labels: []string{"succeeded", "global"},
Value: 1,
})
// Remaining Pods succeed.
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil {
t.Fatal("Failed trying to succeed remaining pods")
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 0,
Failed: 1,
Succeeded: 4,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, nil, "0-3", nil)
validateJobComplete(ctx, t, clientSet, jobObj)
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 5)
validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
Labels: []string{"succeeded", "global"},
Value: 4,
})
validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
Labels: []string{"Indexed", "succeeded", "CompletionsReached"},
Value: 1,
})
validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
Labels: []string{"Indexed", "succeeded"},
Value: 4,
})
validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
Labels: []string{"Indexed", "failed"},
Value: 1,
})
}
func TestJobPodReplacementPolicy(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
indexedCompletion := batchv1.IndexedCompletion
nonIndexedCompletion := batchv1.NonIndexedCompletion
var podReplacementPolicy = func(obj batchv1.PodReplacementPolicy) *batchv1.PodReplacementPolicy {
return &obj
}
type jobStatus struct {
active int
failed int
terminating *int32
}
type jobPodsCreationMetrics struct {
new int
recreateTerminatingOrFailed int
recreateFailed int
}
cases := map[string]struct {
podReplacementPolicyEnabled bool
jobSpec *batchv1.JobSpec
wantStatusAfterDeletion jobStatus
wantStatusAfterFailure jobStatus
wantMetrics jobPodsCreationMetrics
}{
"feature flag off, delete & fail pods, recreate terminating pods, and verify job status counters": {
jobSpec: &batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: &indexedCompletion,
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"fake.example.com/blockDeletion"},
},
},
},
wantStatusAfterDeletion: jobStatus{
active: 2,
failed: 2,
},
wantStatusAfterFailure: jobStatus{
active: 2,
failed: 2,
},
wantMetrics: jobPodsCreationMetrics{
new: 4,
},
},
"feature flag true with IndexedJob, TerminatingOrFailed policy, delete & fail pods, recreate terminating pods, and verify job status counters": {
podReplacementPolicyEnabled: true,
jobSpec: &batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: &indexedCompletion,
PodReplacementPolicy: podReplacementPolicy(batchv1.TerminatingOrFailed),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"fake.example.com/blockDeletion"},
},
},
},
wantStatusAfterDeletion: jobStatus{
active: 2,
failed: 2,
terminating: ptr.To[int32](2),
},
wantStatusAfterFailure: jobStatus{
active: 2,
failed: 2,
terminating: ptr.To[int32](0),
},
wantMetrics: jobPodsCreationMetrics{
new: 2,
recreateTerminatingOrFailed: 2,
},
},
"feature flag true with NonIndexedJob, TerminatingOrFailed policy, delete & fail pods, recreate terminating pods, and verify job status counters": {
podReplacementPolicyEnabled: true,
jobSpec: &batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: &nonIndexedCompletion,
PodReplacementPolicy: podReplacementPolicy(batchv1.TerminatingOrFailed),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"fake.example.com/blockDeletion"},
},
},
},
wantStatusAfterDeletion: jobStatus{
active: 2,
failed: 2,
terminating: ptr.To[int32](2),
},
wantStatusAfterFailure: jobStatus{
active: 2,
failed: 2,
terminating: ptr.To[int32](0),
},
wantMetrics: jobPodsCreationMetrics{
new: 2,
recreateTerminatingOrFailed: 2,
},
},
"feature flag false, podFailurePolicy enabled, delete & fail pods, recreate failed pods, and verify job status counters": {
podReplacementPolicyEnabled: false,
jobSpec: &batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: &nonIndexedCompletion,
PodReplacementPolicy: podReplacementPolicy(batchv1.Failed),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"fake.example.com/blockDeletion"},
},
},
PodFailurePolicy: &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
Action: batchv1.PodFailurePolicyActionFailJob,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{5},
},
},
},
},
},
wantStatusAfterDeletion: jobStatus{
active: 2,
},
wantStatusAfterFailure: jobStatus{
active: 2,
},
wantMetrics: jobPodsCreationMetrics{
new: 2,
},
},
"feature flag true, Failed policy, delete & fail pods, recreate failed pods, and verify job status counters": {
podReplacementPolicyEnabled: true,
jobSpec: &batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: &indexedCompletion,
PodReplacementPolicy: podReplacementPolicy(batchv1.Failed),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"fake.example.com/blockDeletion"},
},
},
},
wantStatusAfterDeletion: jobStatus{
active: 0,
failed: 0,
terminating: ptr.To[int32](2),
},
wantStatusAfterFailure: jobStatus{
active: 2,
failed: 2,
terminating: ptr.To[int32](0),
},
wantMetrics: jobPodsCreationMetrics{
new: 2,
recreateFailed: 2,
},
},
"feature flag true with NonIndexedJob, Failed policy, delete & fail pods, recreate failed pods, and verify job status counters": {
podReplacementPolicyEnabled: true,
jobSpec: &batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](2),
CompletionMode: &nonIndexedCompletion,
PodReplacementPolicy: podReplacementPolicy(batchv1.Failed),
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"fake.example.com/blockDeletion"},
},
},
},
wantStatusAfterDeletion: jobStatus{
active: 0,
failed: 0,
terminating: ptr.To[int32](2),
},
wantStatusAfterFailure: jobStatus{
active: 2,
failed: 2,
terminating: ptr.To[int32](0),
},
wantMetrics: jobPodsCreationMetrics{
new: 2,
recreateFailed: 2,
},
},
}
for name, tc := range cases {
tc := tc
t.Run(name, func(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.podReplacementPolicyEnabled)
closeFn, restConfig, clientSet, ns := setup(t, "pod-replacement-policy")
t.Cleanup(closeFn)
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
t.Cleanup(cancel)
resetMetrics()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: *tc.jobSpec,
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj)
t.Cleanup(func() { removePodsFinalizer(ctx, t, clientSet, ns.Name) })
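			// Delete the pods; the blockDeletion finalizer keeps them around in
			// a terminating state rather than letting them disappear immediately.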
deletePods(ctx, t, clientSet, ns.Name)
validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
Terminating: tc.wantStatusAfterDeletion.terminating,
Failed: tc.wantStatusAfterDeletion.failed,
Active: tc.wantStatusAfterDeletion.active,
Ready: ptr.To[int32](0),
})
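			// Mark the terminating pods as failed and verify the resulting
			// counters match the configured replacement policy.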
failTerminatingPods(ctx, t, clientSet, ns.Name)
validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
Terminating: tc.wantStatusAfterFailure.terminating,
Failed: tc.wantStatusAfterFailure.failed,
Active: tc.wantStatusAfterFailure.active,
Ready: ptr.To[int32](0),
})
validateCounterMetric(
ctx,
t,
metrics.JobPodsCreationTotal,
metricLabelsWithValue{Labels: []string{"new", "succeeded"}, Value: tc.wantMetrics.new},
)
validateCounterMetric(
ctx,
t,
metrics.JobPodsCreationTotal,
metricLabelsWithValue{Labels: []string{"recreate_terminating_or_failed", "succeeded"}, Value: tc.wantMetrics.recreateTerminatingOrFailed},
)
validateCounterMetric(
ctx,
t,
metrics.JobPodsCreationTotal,
metricLabelsWithValue{Labels: []string{"recreate_failed", "succeeded"}, Value: tc.wantMetrics.recreateFailed},
)
})
}
}
// This tests the feature enable -> disable -> enable path for PodReplacementPolicy.
// We verify that the Failed policy works as expected when the feature is turned on.
// Disabling the feature reverts to the previous behavior, and re-enabling it
// restores the original Failed-policy behavior.
func TestJobPodReplacementPolicyFeatureToggling(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
const podCount int32 = 2
jobSpec := batchv1.JobSpec{
Parallelism: ptr.To(podCount),
Completions: ptr.To(podCount),
CompletionMode: ptr.To(batchv1.NonIndexedCompletion),
PodReplacementPolicy: ptr.To(batchv1.Failed),
}
wantTerminating := ptr.To(podCount)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, true)
closeFn, restConfig, clientSet, ns := setup(t, "pod-replacement-policy")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
resetMetrics()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: jobSpec,
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj)
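	// Delete the pods; with the Failed policy and the feature enabled they are
	// counted as terminating rather than failed, so no replacements appear yet.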
deletePods(ctx, t, clientSet, jobObj.Namespace)
validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
Terminating: wantTerminating,
Failed: 0,
Ready: ptr.To[int32](0),
})
// Disable controller and turn feature off.
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, false)
cancel()
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
Terminating: nil,
Failed: int(podCount),
Ready: ptr.To[int32](0),
Active: int(podCount),
})
cancel()
	// Disable the controller and turn the feature on again.
	// However, before re-enabling the feature gate we wait a little (1s for
	// the syncJob re-queue after the update, plus 100ms for the syncJob
	// execution itself) to make sure there is no pending syncJob which could
	// panic if trackTerminating returned false at the start of the sync,
	// but onlyReplaceFailedPods returned true during that sync.
time.Sleep(time.Second + sleepDurationForControllerLatency)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, true)
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj)
deletePods(ctx, t, clientSet, jobObj.Namespace)
validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
Terminating: wantTerminating,
Failed: int(podCount),
Active: 0,
Ready: ptr.To[int32](0),
})
}
func TestElasticIndexedJob(t *testing.T) {
const initialCompletions int32 = 3
type jobUpdate struct {
completions *int32
succeedIndexes []int
failIndexes []int
wantSucceededIndexes string
wantFailed int
wantRemainingIndexes sets.Set[int]
wantActivePods int
wantTerminating *int32
}
cases := map[string]struct {
jobUpdates []jobUpdate
wantErr *apierrors.StatusError
}{
"scale up": {
jobUpdates: []jobUpdate{
{
// Scale up completions 3->4 then succeed indexes 0-3
completions: ptr.To[int32](4),
succeedIndexes: []int{0, 1, 2, 3},
wantSucceededIndexes: "0-3",
wantTerminating: ptr.To[int32](0),
},
},
},
"scale down": {
jobUpdates: []jobUpdate{
				// First succeed index 1 and fail index 2 while completions is still the original value (3).
{
succeedIndexes: []int{1},
failIndexes: []int{2},
wantSucceededIndexes: "1",
wantFailed: 1,
wantRemainingIndexes: sets.New(0, 2),
wantActivePods: 2,
wantTerminating: ptr.To[int32](0),
},
// Scale down completions 3->1, verify prev failure out of range still counts
// but succeeded out of range does not.
{
completions: ptr.To[int32](1),
succeedIndexes: []int{0},
wantSucceededIndexes: "0",
wantFailed: 1,
wantTerminating: ptr.To[int32](0),
},
},
},
"index finishes successfully, scale down, scale up": {
jobUpdates: []jobUpdate{
				// First succeed index 2 while completions is still the original value (3).
{
succeedIndexes: []int{2},
wantSucceededIndexes: "2",
wantRemainingIndexes: sets.New(0, 1),
wantActivePods: 2,
wantTerminating: ptr.To[int32](0),
},
// Scale completions down 3->2 to exclude previously succeeded index.
{
completions: ptr.To[int32](2),
wantRemainingIndexes: sets.New(0, 1),
wantActivePods: 2,
wantTerminating: ptr.To[int32](0),
},
				// Scale completions back up to include the previously succeeded index that was temporarily out of range.
{
completions: ptr.To[int32](3),
succeedIndexes: []int{0, 1, 2},
wantSucceededIndexes: "0-2",
wantTerminating: ptr.To[int32](0),
},
},
},
"scale down to 0, verify that the job succeeds": {
jobUpdates: []jobUpdate{
{
completions: ptr.To[int32](0),
wantTerminating: ptr.To[int32](0),
},
},
},
}
for name, tc := range cases {
tc := tc
t.Run(name, func(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "indexed")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
resetMetrics()
// Set up initial Job in Indexed completion mode.
mode := batchv1.IndexedCompletion
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To(initialCompletions),
Completions: ptr.To(initialCompletions),
CompletionMode: &mode,
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
// Wait for pods to start up.
err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
job, err := jobClient.Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
return false, err
}
if job.Status.Active == initialCompletions {
return true, nil
}
return false, nil
})
if err != nil {
t.Fatalf("Error waiting for Job pods to become active: %v", err)
}
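			// Apply each update in sequence: scale completions (and parallelism)
			// if requested, flip the requested indexes, then verify the Job
			// status and the recorded index sets.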
for _, update := range tc.jobUpdates {
// Update Job spec if necessary.
if update.completions != nil {
if jobObj, err = updateJob(ctx, jobClient, jobObj.Name, func(j *batchv1.Job) {
j.Spec.Completions = update.completions
j.Spec.Parallelism = update.completions
}); err != nil {
if diff := cmp.Diff(tc.wantErr, err); diff != "" {
t.Fatalf("Unexpected or missing errors (-want/+got): %s", diff)
}
return
}
}
// Succeed specified indexes.
for _, idx := range update.succeedIndexes {
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, idx); err != nil {
t.Fatalf("Failed trying to succeed pod with index %d: %v", idx, err)
}
}
// Fail specified indexes.
for _, idx := range update.failIndexes {
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, idx); err != nil {
t.Fatalf("Failed trying to fail pod with index %d: %v", idx, err)
}
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: update.wantActivePods,
Succeeded: len(update.succeedIndexes),
Failed: update.wantFailed,
Ready: ptr.To[int32](0),
Terminating: update.wantTerminating,
})
validateIndexedJobPods(ctx, t, clientSet, jobObj, update.wantRemainingIndexes, update.wantSucceededIndexes, nil)
}
validateJobComplete(ctx, t, clientSet, jobObj)
})
}
}
// BenchmarkLargeIndexedJob benchmarks the completion of an Indexed Job.
// We expect that large jobs are more commonly used as Indexed. They are
// also faster to track, as they need fewer API calls.
func BenchmarkLargeIndexedJob(b *testing.B) {
closeFn, restConfig, clientSet, ns := setup(b, "indexed")
restConfig.QPS = 100
restConfig.Burst = 100
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(b, restConfig)
defer cancel()
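	// Retry budget for flipping pod phases, to tolerate transient API errors
	// while the benchmark is running.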
backoff := wait.Backoff{
Duration: time.Second,
Factor: 1.5,
Steps: 30,
Cap: 5 * time.Minute,
}
cases := map[string]struct {
nPods int32
backoffLimitPerIndex *int32
}{
"regular indexed job without failures; size=10": {
nPods: 10,
},
"job with backoffLimitPerIndex without failures; size=10": {
nPods: 10,
backoffLimitPerIndex: ptr.To[int32](1),
},
"regular indexed job without failures; size=100": {
nPods: 100,
},
"job with backoffLimitPerIndex without failures; size=100": {
nPods: 100,
backoffLimitPerIndex: ptr.To[int32](1),
},
}
mode := batchv1.IndexedCompletion
for name, tc := range cases {
b.Run(name, func(b *testing.B) {
enableJobBackoffLimitPerIndex := tc.backoffLimitPerIndex != nil
featuregatetesting.SetFeatureGateDuringTest(b, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, enableJobBackoffLimitPerIndex)
b.ResetTimer()
for n := 0; n < b.N; n++ {
b.StartTimer()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("npods-%d-%d-%v", tc.nPods, n, enableJobBackoffLimitPerIndex),
},
Spec: batchv1.JobSpec{
Parallelism: ptr.To(tc.nPods),
Completions: ptr.To(tc.nPods),
CompletionMode: &mode,
BackoffLimitPerIndex: tc.backoffLimitPerIndex,
},
})
if err != nil {
b.Fatalf("Failed to create Job: %v", err)
}
b.Cleanup(func() {
if err := cleanUp(ctx, clientSet, jobObj); err != nil {
b.Fatalf("Failed cleanup: %v", err)
}
})
remaining := int(tc.nPods)
if err := wait.ExponentialBackoff(backoff, func() (done bool, err error) {
if succ, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remaining); err != nil {
remaining -= succ
b.Logf("Transient failure succeeding pods: %v", err)
return false, nil
}
return true, nil
}); err != nil {
b.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err)
}
validateJobComplete(ctx, b, clientSet, jobObj)
b.StopTimer()
}
})
}
}
// BenchmarkLargeFailureHandling benchmarks the handling of numerous pod failures
// of an Indexed Job. We set a minimal backoff delay to make the job controller
// performance comparable for indexed jobs with a global backoffLimit, and those
// with a per-index backoffLimit, despite different patterns of handling failures.
func BenchmarkLargeFailureHandling(b *testing.B) {
b.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
b.Cleanup(setDurationDuringTest(&jobcontroller.MaxJobPodFailureBackOff, fastPodFailureBackoff))
closeFn, restConfig, clientSet, ns := setup(b, "indexed")
restConfig.QPS = 100
restConfig.Burst = 100
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(b, restConfig)
defer cancel()
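	// Retry budget for failing pods, to tolerate transient API errors while
	// the benchmark is running.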
backoff := wait.Backoff{
Duration: time.Second,
Factor: 1.5,
Steps: 30,
Cap: 5 * time.Minute,
}
cases := map[string]struct {
nPods int32
backoffLimitPerIndex *int32
customTimeout *time.Duration
}{
"regular indexed job with failures; size=10": {
nPods: 10,
},
"job with backoffLimitPerIndex with failures; size=10": {
nPods: 10,
backoffLimitPerIndex: ptr.To[int32](1),
},
"regular indexed job with failures; size=100": {
nPods: 100,
},
"job with backoffLimitPerIndex with failures; size=100": {
nPods: 100,
backoffLimitPerIndex: ptr.To[int32](1),
},
}
mode := batchv1.IndexedCompletion
for name, tc := range cases {
b.Run(name, func(b *testing.B) {
enableJobBackoffLimitPerIndex := tc.backoffLimitPerIndex != nil
timeout := ptr.Deref(tc.customTimeout, wait.ForeverTestTimeout)
featuregatetesting.SetFeatureGateDuringTest(b, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, enableJobBackoffLimitPerIndex)
b.ResetTimer()
for n := 0; n < b.N; n++ {
b.StopTimer()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("npods-%d-%d-%v", tc.nPods, n, enableJobBackoffLimitPerIndex),
},
Spec: batchv1.JobSpec{
Parallelism: ptr.To(tc.nPods),
Completions: ptr.To(tc.nPods),
CompletionMode: &mode,
BackoffLimitPerIndex: tc.backoffLimitPerIndex,
BackoffLimit: ptr.To(tc.nPods),
},
})
if err != nil {
b.Fatalf("Failed to create Job: %v", err)
}
b.Cleanup(func() {
if err := cleanUp(ctx, clientSet, jobObj); err != nil {
b.Fatalf("Failed cleanup: %v", err)
}
})
validateJobsPodsStatusOnlyWithTimeout(ctx, b, clientSet, jobObj, podsByStatus{
Active: int(tc.nPods),
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
}, timeout)
b.StartTimer()
remaining := int(tc.nPods)
if err := wait.ExponentialBackoff(backoff, func() (done bool, err error) {
if fail, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, remaining); err != nil {
remaining -= fail
b.Logf("Transient failure failing pods: %v", err)
return false, nil
}
return true, nil
}); err != nil {
b.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err)
}
validateJobsPodsStatusOnlyWithTimeout(ctx, b, clientSet, jobObj, podsByStatus{
Active: int(tc.nPods),
Ready: ptr.To[int32](0),
Failed: int(tc.nPods),
Terminating: ptr.To[int32](0),
}, timeout)
b.StopTimer()
}
})
}
}
// cleanUp deletes all pods and the job
func cleanUp(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job) error {
// Clean up pods in pages, because DeleteCollection might timeout.
// #90743
for {
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{Limit: 1})
if err != nil {
return err
}
if len(pods.Items) == 0 {
break
}
err = clientSet.CoreV1().Pods(jobObj.Namespace).DeleteCollection(ctx,
metav1.DeleteOptions{},
metav1.ListOptions{
Limit: 1000,
})
if err != nil {
return err
}
}
return clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(ctx, jobObj.Name, metav1.DeleteOptions{})
}
func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) {
for _, policy := range []metav1.DeletionPropagation{metav1.DeletePropagationOrphan, metav1.DeletePropagationBackground, metav1.DeletePropagationForeground} {
t.Run(string(policy), func(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
informerSet := informers.NewSharedInformerFactory(clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "controller-informers")), 0)
			// Make the job controller significantly slower to trigger the race condition.
restConfig.QPS = 1
restConfig.Burst = 1
jc, ctx, cancel := createJobControllerWithSharedInformers(t, restConfig, informerSet)
resetMetrics()
defer cancel()
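			// Restore a high QPS for the clients created from this config below
			// (e.g. the GC controller); the job controller keeps the low QPS it
			// was created with.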
restConfig.QPS = 200
restConfig.Burst = 200
runGC := util.CreateGCController(ctx, t, *restConfig, informerSet)
informerSet.Start(ctx.Done())
go jc.Run(ctx, 1)
runGC()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 2,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
// Delete Job. The GC should delete the pods in cascade.
err = clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(ctx, jobObj.Name, metav1.DeleteOptions{
PropagationPolicy: &policy,
})
if err != nil {
t.Fatalf("Failed to delete job: %v", err)
}
validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj)
// Pods never finished, so they are not counted in the metric.
validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 0)
})
}
}
func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) {
// Set a maximum number of uncounted pods below parallelism, to ensure it
// doesn't affect the termination of pods.
t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50))
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
	// Job tracking with finalizers requires fewer calls in Indexed mode,
	// so it's more likely to process all finalizers before all the pods
	// are visible.
mode := batchv1.IndexedCompletion
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
CompletionMode: &mode,
Completions: ptr.To[int32](100),
Parallelism: ptr.To[int32](100),
BackoffLimit: ptr.To[int32](0),
},
})
if err != nil {
t.Fatalf("Could not create job: %v", err)
}
// Fail a pod ASAP.
err = wait.PollUntilContextTimeout(ctx, time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
return false, nil
}
return true, nil
})
if err != nil {
t.Fatalf("Could not fail pod: %v", err)
}
validateJobFailed(ctx, t, clientSet, jobObj)
validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
Labels: []string{"Indexed", "failed", "BackoffLimitExceeded"},
Value: 1,
})
validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj)
}
func TestJobPodsCreatedWithExponentialBackoff(t *testing.T) {
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, 2*time.Second))
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{})
if err != nil {
t.Fatalf("Could not create job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
// Fail the first pod
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Failed: 1,
Terminating: ptr.To[int32](0),
})
// Fail the second pod
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Failed: 2,
Terminating: ptr.To[int32](0),
})
jobPods, err := getJobPods(ctx, t, clientSet, jobObj, func(ps v1.PodStatus) bool { return true })
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
}
	if len(jobPods) != 3 {
		t.Fatalf("Expected to get %v pods, received %v", 3, len(jobPods))
	}
	validateExponentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, jobPods)
}
func validateExponentialBackoffDelay(t *testing.T, defaultPodFailureBackoff time.Duration, pods []*v1.Pod) {
t.Helper()
creationTime := []time.Time{}
finishTime := []time.Time{}
for _, pod := range pods {
creationTime = append(creationTime, pod.CreationTimestamp.Time)
if len(pod.Status.ContainerStatuses) > 0 {
finishTime = append(finishTime, pod.Status.ContainerStatuses[0].State.Terminated.FinishedAt.Time)
}
}
sort.Slice(creationTime, func(i, j int) bool {
return creationTime[i].Before(creationTime[j])
})
sort.Slice(finishTime, func(i, j int) bool {
return finishTime[i].Before(finishTime[j])
})
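	// The gap between one pod finishing and the next pod being created should
	// fall within [backoff, 2*backoff), and the backoff doubles after each
	// consecutive failure.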
	diff := creationTime[1].Sub(finishTime[0])
	if diff < defaultPodFailureBackoff {
		t.Fatalf("Second pod should be created at least %v after the first pod finished, time difference: %v", defaultPodFailureBackoff, diff)
	}
	if diff >= 2*defaultPodFailureBackoff {
		t.Fatalf("Second pod should be created less than %v after the first pod finished, time difference: %v", 2*defaultPodFailureBackoff, diff)
	}
	diff = creationTime[2].Sub(finishTime[1])
	if diff < 2*defaultPodFailureBackoff {
		t.Fatalf("Third pod should be created at least %v after the second pod finished, time difference: %v", 2*defaultPodFailureBackoff, diff)
	}
	if diff >= 4*defaultPodFailureBackoff {
		t.Fatalf("Third pod should be created less than %v after the second pod finished, time difference: %v", 4*defaultPodFailureBackoff, diff)
	}
}
// TestJobFailedWithInterrupts tests that a job where one pod fails and the
// rest succeed is marked as Failed, even if the controller fails in the middle.
func TestJobFailedWithInterrupts(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer func() {
cancel()
}()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Completions: ptr.To[int32](10),
Parallelism: ptr.To[int32](10),
BackoffLimit: ptr.To[int32](0),
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
NodeName: "foo", // Scheduled pods are not deleted immediately.
},
},
},
})
if err != nil {
t.Fatalf("Could not create job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 10,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
t.Log("Finishing pods")
if _, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
t.Fatalf("Could not fail a pod: %v", err)
}
remaining := 9
if err := wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
if succ, err := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remaining); err != nil {
remaining -= succ
t.Logf("Transient failure succeeding pods: %v", err)
return false, nil
}
return true, nil
}); err != nil {
t.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err)
}
t.Log("Recreating job controller")
cancel()
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobFailed)
}
func validateNoOrphanPodsWithFinalizers(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) {
t.Helper()
orphanPods := 0
if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{
LabelSelector: metav1.FormatLabelSelector(jobObj.Spec.Selector),
})
if err != nil {
return false, err
}
orphanPods = 0
for _, pod := range pods.Items {
if hasJobTrackingFinalizer(&pod) {
orphanPods++
}
}
return orphanPods == 0, nil
}); err != nil {
t.Errorf("Failed waiting for pods to be freed from finalizer: %v", err)
t.Logf("Last saw %d orphan pods", orphanPods)
}
}

// TestOrphanPodsFinalizersClearedOnRestart verifies that the tracking
// finalizers of orphan Pods are removed after the controller restarts.
func TestOrphanPodsFinalizersClearedOnRestart(t *testing.T) {
	// Step 1: Create the Job.
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
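	// Defer through a closure: cancel is reassigned when the controller is
	// restarted in Step 3.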
defer func() {
cancel()
}()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](1),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
Active: 1,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
// Step 2: Delete the Job while the controller is stopped.
cancel()
err = clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(context.Background(), jobObj.Name, metav1.DeleteOptions{})
if err != nil {
t.Fatalf("Failed to delete job: %v", err)
}
// Step 3: Restart controller.
ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj)
}
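
// TestSuspendJob verifies that suspending a Job deletes its active Pods and
// resuming it recreates them, with the JobSuspended condition and events
// updated accordingly.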
func TestSuspendJob(t *testing.T) {
type step struct {
flag bool
wantActive int
wantStatus v1.ConditionStatus
wantReason string
}
	testCases := []struct {
		create step
		update step
	}{
// Exhaustively test all combinations other than trivial true->true and
// false->false cases.
{
create: step{flag: false, wantActive: 2},
update: step{flag: true, wantActive: 0, wantStatus: v1.ConditionTrue, wantReason: "Suspended"},
},
{
create: step{flag: true, wantActive: 0, wantStatus: v1.ConditionTrue, wantReason: "Suspended"},
update: step{flag: false, wantActive: 2, wantStatus: v1.ConditionFalse, wantReason: "Resumed"},
},
}
for _, tc := range testCases {
		name := fmt.Sprintf("create=%v,update=%v", tc.create.flag, tc.update.flag)
t.Run(name, func(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "suspend")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
events, err := clientSet.EventsV1().Events(ns.Name).Watch(ctx, metav1.ListOptions{})
if err != nil {
t.Fatal(err)
}
defer events.Stop()
parallelism := int32(2)
job, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To(parallelism),
Completions: ptr.To[int32](4),
Suspend: ptr.To(tc.create.flag),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validate := func(s string, active int, status v1.ConditionStatus, reason string) {
validateJobPodsStatus(ctx, t, clientSet, job, podsByStatus{
Active: active,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
job, err = clientSet.BatchV1().Jobs(ns.Name).Get(ctx, job.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Failed to get Job after %s: %v", s, err)
}
if got, want := getJobConditionStatus(ctx, job, batchv1.JobSuspended), status; got != want {
t.Errorf("Unexpected Job condition %q status after %s: got %q, want %q", batchv1.JobSuspended, s, got, want)
}
if err := waitForEvent(ctx, events, job.UID, reason); err != nil {
t.Errorf("Waiting for event with reason %q after %s: %v", reason, s, err)
}
}
validate("create", tc.create.wantActive, tc.create.wantStatus, tc.create.wantReason)
job.Spec.Suspend = ptr.To(tc.update.flag)
job, err = clientSet.BatchV1().Jobs(ns.Name).Update(ctx, job, metav1.UpdateOptions{})
if err != nil {
t.Fatalf("Failed to update Job: %v", err)
}
validate("update", tc.update.wantActive, tc.update.wantStatus, tc.update.wantReason)
})
}
}
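
// TestSuspendJobControllerRestart verifies that a Job created in a suspended
// state does not start any Pods.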
func TestSuspendJobControllerRestart(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "suspend")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
job, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](2),
Completions: ptr.To[int32](4),
Suspend: ptr.To(true),
},
})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
validateJobPodsStatus(ctx, t, clientSet, job, podsByStatus{
Active: 0,
Ready: ptr.To[int32](0),
Terminating: ptr.To[int32](0),
})
}
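
// TestNodeSelectorUpdate verifies that the Pod template's node selector of a
// suspended Job can be updated together with unsuspending it, but not once
// the Job is unsuspended.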
func TestNodeSelectorUpdate(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "suspend")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
defer cancel()
job, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{Spec: batchv1.JobSpec{
Parallelism: ptr.To[int32](1),
Suspend: ptr.To(true),
}})
if err != nil {
t.Fatalf("Failed to create Job: %v", err)
}
jobName := job.Name
jobNamespace := job.Namespace
jobClient := clientSet.BatchV1().Jobs(jobNamespace)
// (1) Unsuspend and set node selector in the same update.
nodeSelector := map[string]string{"foo": "bar"}
if _, err := updateJob(ctx, jobClient, jobName, func(j *batchv1.Job) {
j.Spec.Template.Spec.NodeSelector = nodeSelector
j.Spec.Suspend = ptr.To(false)
}); err != nil {
t.Errorf("Unexpected error: %v", err)
}
// (2) Check that the pod was created using the expected node selector.
var pod *v1.Pod
if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
pods, err := clientSet.CoreV1().Pods(jobNamespace).List(ctx, metav1.ListOptions{})
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
}
if len(pods.Items) == 0 {
return false, nil
}
pod = &pods.Items[0]
return true, nil
}); err != nil || pod == nil {
t.Fatalf("pod not found: %v", err)
}
	// The Job should now be unsuspended and the Pod should have been created
	// with the expected node selector.
if diff := cmp.Diff(nodeSelector, pod.Spec.NodeSelector); diff != "" {
t.Errorf("Unexpected nodeSelector (-want,+got):\n%s", diff)
}
// (3) Update node selector again. It should fail since the job is unsuspended.
_, err = updateJob(ctx, jobClient, jobName, func(j *batchv1.Job) {
j.Spec.Template.Spec.NodeSelector = map[string]string{"foo": "baz"}
})
if err == nil || !strings.Contains(err.Error(), "spec.template: Invalid value") {
t.Errorf("Expected \"spec.template: Invalid value\" error, got: %v", err)
}
}
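
// podsByStatus mirrors the Pod counters of batchv1.JobStatus for assertions
// in tests.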
type podsByStatus struct {
Active int
Ready *int32
Failed int
Succeeded int
Terminating *int32
}
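
// validateJobsPodsStatusOnly waits until the Job status reports the desired
// Pod counts, using the default test timeout.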
func validateJobsPodsStatusOnly(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus) {
t.Helper()
validateJobsPodsStatusOnlyWithTimeout(ctx, t, clientSet, jobObj, desired, wait.ForeverTestTimeout)
}
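
// validateJobsPodsStatusOnlyWithTimeout polls the Job until its status
// reports the desired Pod counts, or reports an error when the timeout
// expires.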
func validateJobsPodsStatusOnlyWithTimeout(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus, timeout time.Duration) {
t.Helper()
var actualCounts podsByStatus
if err := wait.PollUntilContextTimeout(ctx, waitInterval, timeout, true, func(ctx context.Context) (bool, error) {
updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Failed to get updated Job: %v", err)
}
actualCounts = podsByStatus{
Active: int(updatedJob.Status.Active),
Ready: updatedJob.Status.Ready,
Succeeded: int(updatedJob.Status.Succeeded),
Failed: int(updatedJob.Status.Failed),
Terminating: updatedJob.Status.Terminating,
}
return cmp.Equal(actualCounts, desired), nil
}); err != nil {
diff := cmp.Diff(desired, actualCounts)
t.Errorf("Waiting for Job Status: %v\nPods (-want,+got):\n%s", err, diff)
}
}
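
// validateJobStatus waits until the Job status matches wantStatus, ignoring
// timestamps, condition messages and uncounted terminated Pods.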
func validateJobStatus(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, wantStatus batchv1.JobStatus) {
t.Helper()
diff := ""
if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
gotJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Failed to get updated Job: %v, last status diff (-want,+got):\n%s", err, diff)
}
diff = cmp.Diff(wantStatus, gotJob.Status,
cmpopts.EquateEmpty(),
cmpopts.IgnoreFields(batchv1.JobStatus{}, "StartTime", "UncountedTerminatedPods", "CompletionTime"),
cmpopts.IgnoreFields(batchv1.JobCondition{}, "LastProbeTime", "LastTransitionTime", "Message"),
)
return diff == "", nil
}); err != nil {
t.Fatalf("Waiting for Job Status: %v\n, Status diff (-want,+got):\n%s", err, diff)
}
}
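
// validateJobPodsStatus checks the Job status counters and additionally
// verifies that the expected number of Pods is active and that each active
// Pod still carries the tracking finalizer.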
func validateJobPodsStatus(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus) {
t.Helper()
validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, desired)
var active []*v1.Pod
	if err := wait.PollUntilContextTimeout(ctx, waitInterval, 5*time.Second, true, func(ctx context.Context) (bool, error) {
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
}
active = nil
for _, pod := range pods.Items {
phase := pod.Status.Phase
if metav1.IsControlledBy(&pod, jobObj) && (phase == v1.PodPending || phase == v1.PodRunning) {
p := pod
active = append(active, &p)
}
}
return len(active) == desired.Active, nil
}); err != nil {
if len(active) != desired.Active {
t.Errorf("Found %d active Pods, want %d", len(active), desired.Active)
}
}
for _, p := range active {
if !hasJobTrackingFinalizer(p) {
t.Errorf("Active pod %s doesn't have tracking finalizer", p.Name)
}
}
}
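
// getJobPods lists the Pods owned by the Job whose status matches the given
// filter.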
func getJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, filter func(v1.PodStatus) bool) ([]*v1.Pod, error) {
t.Helper()
allPods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return nil, err
}
	jobPods := make([]*v1.Pod, 0)
for _, pod := range allPods.Items {
if metav1.IsControlledBy(&pod, jobObj) && filter(pod.Status) {
p := pod
jobPods = append(jobPods, &p)
}
}
return jobPods, nil
}
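
// validateFinishedPodsNoFinalizer verifies that no finished Pod of the Job
// still carries the tracking finalizer.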
func validateFinishedPodsNoFinalizer(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) {
t.Helper()
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
}
for _, pod := range pods.Items {
phase := pod.Status.Phase
		if metav1.IsControlledBy(&pod, jobObj) && (phase == v1.PodSucceeded || phase == v1.PodFailed) && hasJobTrackingFinalizer(&pod) {
t.Errorf("Finished pod %s still has a tracking finalizer", pod.Name)
}
}
}

// validateIndexedJobPods validates the completed and failed indexes reported
// in the Job status, and the indexes and hostnames of the active Pods of an
// Indexed Job. Call it after validateJobPodsStatus.
func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, wantActive sets.Set[int], wantCompleted string, wantFailed *string) {
t.Helper()
updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Failed to get updated Job: %v", err)
}
	if updatedJob.Status.CompletedIndexes != wantCompleted {
		t.Errorf("Got completed indexes %q, want %q", updatedJob.Status.CompletedIndexes, wantCompleted)
	}
if diff := cmp.Diff(wantFailed, updatedJob.Status.FailedIndexes); diff != "" {
t.Errorf("Got unexpected failed indexes: %s", diff)
}
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
t.Fatalf("Failed to list Job Pods: %v", err)
}
gotActive := sets.New[int]()
for _, pod := range pods.Items {
if metav1.IsControlledBy(&pod, jobObj) {
if pod.Status.Phase == v1.PodPending || pod.Status.Phase == v1.PodRunning {
ix, err := getCompletionIndex(&pod)
if err != nil {
t.Errorf("Failed getting completion index for pod %s: %v", pod.Name, err)
} else {
gotActive.Insert(ix)
}
				expectedName := fmt.Sprintf("%s-%d", jobObj.Name, ix)
				if pod.Spec.Hostname != expectedName {
					t.Errorf("Got pod hostname %s, want %s", pod.Spec.Hostname, expectedName)
				}
}
}
}
if wantActive == nil {
wantActive = sets.New[int]()
}
if diff := cmp.Diff(sets.List(wantActive), sets.List(gotActive)); diff != "" {
t.Errorf("Unexpected active indexes (-want,+got):\n%s", diff)
}
}
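
// waitForEvent waits until the job-controller emits an event with the given
// reason for the object with the given UID. An empty reason matches
// immediately.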
func waitForEvent(ctx context.Context, events watch.Interface, uid types.UID, reason string) error {
if reason == "" {
return nil
}
return wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
for {
var ev watch.Event
select {
case ev = <-events.ResultChan():
default:
return false, nil
}
e, ok := ev.Object.(*eventsv1.Event)
if !ok {
continue
}
ctrl := "job-controller"
if (e.ReportingController == ctrl || e.DeprecatedSource.Component == ctrl) && e.Reason == reason && e.Regarding.UID == uid {
return true, nil
}
}
})
}
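
// getJobConditionStatus returns the status of the Job condition of the given
// type, or the empty string if the condition is not present.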
func getJobConditionStatus(ctx context.Context, job *batchv1.Job, cType batchv1.JobConditionType) v1.ConditionStatus {
for _, cond := range job.Status.Conditions {
if cond.Type == cType {
return cond.Status
}
}
return ""
}
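
// validateJobFailed waits for the Job to have the JobFailed condition set to
// true.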
func validateJobFailed(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) {
t.Helper()
validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobFailed)
}
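
// validateJobComplete waits for the Job to have the JobComplete condition set
// to true.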
func validateJobComplete(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job) {
t.Helper()
validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobComplete)
}
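
// validateJobCondition waits until the Job has the given condition with
// status True.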
func validateJobCondition(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, cond batchv1.JobConditionType) {
t.Helper()
if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
j, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Failed to obtain updated Job: %v", err)
}
return getJobConditionStatus(ctx, j, cond) == v1.ConditionTrue, nil
}); err != nil {
t.Errorf("Waiting for Job to have condition %s: %v", cond, err)
}
}
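
// setJobPodsPhase sets the phase of up to cnt non-terminated Pods of the Job,
// attaching a terminated container status when the phase is Failed or
// Succeeded. It returns the number of Pods updated.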
func setJobPodsPhase(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, phase v1.PodPhase, cnt int) (int, error) {
op := func(p *v1.Pod) bool {
p.Status.Phase = phase
if phase == v1.PodFailed || phase == v1.PodSucceeded {
p.Status.ContainerStatuses = []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
FinishedAt: metav1.Now(),
},
},
},
}
}
return true
}
return updateJobPodsStatus(ctx, clientSet, jobObj, op, cnt)
}
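
// setJobPodsReady marks up to cnt not-yet-ready Pods of the Job as Ready and
// returns the number of Pods updated.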
func setJobPodsReady(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, cnt int) (int, error) {
op := func(p *v1.Pod) bool {
if podutil.IsPodReady(p) {
return false
}
p.Status.Conditions = append(p.Status.Conditions, v1.PodCondition{
Type: v1.PodReady,
Status: v1.ConditionTrue,
})
return true
}
return updateJobPodsStatus(ctx, clientSet, jobObj, op, cnt)
}
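
// updateJobPodsStatus applies op to up to cnt non-terminated Pods of the Job
// and persists the status updates. It returns the number of successful
// updates.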
func updateJobPodsStatus(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, op func(*v1.Pod) bool, cnt int) (int, error) {
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return 0, fmt.Errorf("listing Job Pods: %w", err)
}
updates := make([]v1.Pod, 0, cnt)
for _, pod := range pods.Items {
if len(updates) == cnt {
break
}
if p := pod.Status.Phase; metav1.IsControlledBy(&pod, jobObj) && p != v1.PodFailed && p != v1.PodSucceeded {
if !op(&pod) {
continue
}
updates = append(updates, pod)
}
}
	successful, err := updatePodStatuses(ctx, clientSet, updates)
	if err != nil {
		return successful, err
	}
	if successful != cnt {
		return successful, fmt.Errorf("couldn't update the status of %d Job pods", cnt-successful)
	}
	return successful, nil
}
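
// updatePodStatuses updates the status of the given Pods concurrently,
// returning the number of successful updates and the first error observed.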
func updatePodStatuses(ctx context.Context, clientSet clientset.Interface, updates []v1.Pod) (int, error) {
wg := sync.WaitGroup{}
wg.Add(len(updates))
errCh := make(chan error, len(updates))
var updated int32
for _, pod := range updates {
pod := pod
go func() {
_, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{})
if err != nil {
errCh <- err
} else {
atomic.AddInt32(&updated, 1)
}
wg.Done()
}()
}
wg.Wait()
select {
case err := <-errCh:
return int(updated), fmt.Errorf("updating Pod status: %w", err)
default:
}
return int(updated), nil
}
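
// setJobPhaseForIndex sets the phase of the non-terminated Pod of the Job
// with the given completion index.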
func setJobPhaseForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, phase v1.PodPhase, ix int) error {
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("listing Job Pods: %w", err)
}
for _, pod := range pods.Items {
if p := pod.Status.Phase; !metav1.IsControlledBy(&pod, jobObj) || p == v1.PodFailed || p == v1.PodSucceeded {
continue
}
if pix, err := getCompletionIndex(&pod); err == nil && pix == ix {
pod.Status.Phase = phase
if phase == v1.PodFailed || phase == v1.PodSucceeded {
pod.Status.ContainerStatuses = []v1.ContainerStatus{
{
State: v1.ContainerState{
Terminated: &v1.ContainerStateTerminated{
FinishedAt: metav1.Now(),
},
},
},
}
}
_, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("updating pod %s status: %w", pod.Name, err)
}
return nil
}
}
return errors.New("no pod matching index found")
}
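
// getActivePodForIndex returns a non-terminal Pod of the Job with the given
// completion index.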
func getActivePodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int) (*v1.Pod, error) {
return getJobPodForIndex(ctx, clientSet, jobObj, ix, func(p *v1.Pod) bool {
return !podutil.IsPodTerminal(p)
})
}
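
// getJobPodForIndex returns the first Pod of the Job that passes the filter
// and has the given completion index.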
func getJobPodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) (*v1.Pod, error) {
pods, err := getJobPodsForIndex(ctx, clientSet, jobObj, ix, filter)
if err != nil {
return nil, err
}
if len(pods) == 0 {
return nil, fmt.Errorf("Pod not found for index: %v", ix)
}
return pods[0], nil
}
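
// getJobPodsForIndex returns all Pods of the Job that pass the filter and
// have the given completion index.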
func getJobPodsForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) ([]*v1.Pod, error) {
pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("listing Job Pods: %w", err)
}
var result []*v1.Pod
for _, pod := range pods.Items {
pod := pod
if !metav1.IsControlledBy(&pod, jobObj) {
continue
}
if !filter(&pod) {
continue
}
if pix, err := getCompletionIndex(&pod); err == nil && pix == ix {
result = append(result, &pod)
}
}
return result, nil
}
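
// getCompletionIndex parses the completion index from the Pod's
// batch.kubernetes.io/job-completion-index annotation.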
func getCompletionIndex(p *v1.Pod) (int, error) {
if p.Annotations == nil {
return 0, errors.New("no annotations found")
}
v, ok := p.Annotations[batchv1.JobCompletionIndexAnnotation]
if !ok {
return 0, fmt.Errorf("annotation %s not found", batchv1.JobCompletionIndexAnnotation)
}
return strconv.Atoi(v)
}
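
// createJobWithDefaults fills in a default name, container and restart policy
// before creating the Job in the given namespace.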
func createJobWithDefaults(ctx context.Context, clientSet clientset.Interface, ns string, jobObj *batchv1.Job) (*batchv1.Job, error) {
if jobObj.Name == "" {
jobObj.Name = "test-job"
}
if len(jobObj.Spec.Template.Spec.Containers) == 0 {
jobObj.Spec.Template.Spec.Containers = []v1.Container{
{Name: "foo", Image: "bar"},
}
}
if jobObj.Spec.Template.Spec.RestartPolicy == "" {
jobObj.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever
}
return clientSet.BatchV1().Jobs(ns).Create(ctx, jobObj, metav1.CreateOptions{})
}

// setup starts a test API server backed by a shared etcd instance and returns
// a teardown function, a rest config, a client and a fresh test namespace.
func setup(t testing.TB, nsBaseName string) (framework.TearDownFunc, *restclient.Config, clientset.Interface, *v1.Namespace) {
	// Disable the ServiceAccount admission plugin, as we don't have the
	// serviceaccount controller running.
server := kubeapiservertesting.StartTestServerOrDie(t, nil, framework.DefaultTestServerFlags(), framework.SharedEtcd())
config := restclient.CopyConfig(server.ClientConfig)
config.QPS = 200
config.Burst = 200
config.Timeout = 0
clientSet, err := clientset.NewForConfig(config)
if err != nil {
t.Fatalf("Error creating clientset: %v", err)
}
ns := framework.CreateNamespaceOrDie(clientSet, nsBaseName, t)
closeFn := func() {
framework.DeleteNamespaceOrDie(clientSet, ns, t)
server.TearDownFn()
}
return closeFn, config, clientSet, ns
}
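
// startJobControllerAndWaitForCaches starts the Job controller with a fresh
// informer factory and blocks until the informer caches have synced.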
func startJobControllerAndWaitForCaches(tb testing.TB, restConfig *restclient.Config) (context.Context, context.CancelFunc) {
tb.Helper()
informerSet := informers.NewSharedInformerFactory(clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "job-informers")), 0)
jc, ctx, cancel := createJobControllerWithSharedInformers(tb, restConfig, informerSet)
informerSet.Start(ctx.Done())
go jc.Run(ctx, 1)
	// Since this method starts the controller in a separate goroutine and the
	// tests don't check /readyz, they have no way to tell when it is safe to
	// call the server without requests being rejected. As a proxy, wait until
	// the informer caches have synced.
informerSet.WaitForCacheSync(ctx.Done())
return ctx, cancel
}
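
// resetMetrics resets the Job controller metrics that the tests inspect.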
func resetMetrics() {
metrics.TerminatedPodsTrackingFinalizerTotal.Reset()
metrics.JobFinishedNum.Reset()
metrics.JobPodsFinished.Reset()
metrics.PodFailuresHandledByFailurePolicy.Reset()
metrics.JobFinishedIndexesTotal.Reset()
metrics.JobPodsCreationTotal.Reset()
metrics.JobByExternalControllerTotal.Reset()
}
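
// createJobControllerWithSharedInformers constructs a Job controller wired to
// the given shared informer factory, along with its context and cancel
// function.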
func createJobControllerWithSharedInformers(tb testing.TB, restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) {
tb.Helper()
clientSet := clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "job-controller"))
ctx, cancel := context.WithCancel(context.Background())
jc, err := jobcontroller.NewController(ctx, informerSet.Core().V1().Pods(), informerSet.Batch().V1().Jobs(), clientSet)
if err != nil {
tb.Fatalf("Error creating Job controller: %v", err)
}
return jc, ctx, cancel
}
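
// hasJobTrackingFinalizer reports whether the object carries the Job tracking
// finalizer.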
func hasJobTrackingFinalizer(obj metav1.Object) bool {
for _, fin := range obj.GetFinalizers() {
if fin == batchv1.JobTrackingFinalizer {
return true
}
}
return false
}
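
// setDuringTest overrides an int value for the duration of a test and returns
// a function that restores the original value.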
func setDuringTest(val *int, newVal int) func() {
origVal := *val
*val = newVal
return func() {
*val = origVal
}
}
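
// setDurationDuringTest is the time.Duration counterpart of setDuringTest.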
func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() {
origVal := *val
*val = newVal
return func() {
*val = origVal
}
}
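
// updateJob applies updateFunc to the latest version of the Job, retrying on
// conflicts.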
func updateJob(ctx context.Context, jobClient typedv1.JobInterface, jobName string, updateFunc func(*batchv1.Job)) (*batchv1.Job, error) {
var job *batchv1.Job
err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
newJob, err := jobClient.Get(ctx, jobName, metav1.GetOptions{})
if err != nil {
return err
}
updateFunc(newJob)
job, err = jobClient.Update(ctx, newJob, metav1.UpdateOptions{})
return err
})
return job, err
}
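
// waitForPodsToBeActive waits until the Job status reports podCount active
// Pods.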
func waitForPodsToBeActive(ctx context.Context, t *testing.T, jobClient typedv1.JobInterface, podCount int32, jobObj *batchv1.Job) {
t.Helper()
err := wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(context.Context) (done bool, err error) {
job, err := jobClient.Get(ctx, jobObj.Name, metav1.GetOptions{})
if err != nil {
return false, err
}
return job.Status.Active == podCount, nil
})
if err != nil {
t.Fatalf("Error waiting for Job pods to become active: %v", err)
}
}
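
// deletePods deletes all Pods in the given namespace.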
func deletePods(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) {
t.Helper()
err := clientSet.CoreV1().Pods(namespace).DeleteCollection(ctx,
metav1.DeleteOptions{},
metav1.ListOptions{
Limit: 1000,
})
if err != nil {
t.Fatalf("Failed to cleanup Pods: %v", err)
}
}
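
// removePodsFinalizer removes the fake.example.com/blockDeletion finalizer
// from every Pod in the namespace.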
func removePodsFinalizer(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) {
t.Helper()
pods, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
if err != nil {
t.Fatalf("Failed to list pods: %v", err)
}
	updatePod(ctx, t, clientSet, pods.Items, func(pod *v1.Pod) {
		// Rebuild the finalizer list instead of removing elements in place,
		// which avoids mutating the slice while iterating over it.
		finalizers := make([]string, 0, len(pod.Finalizers))
		for _, fin := range pod.Finalizers {
			if fin != "fake.example.com/blockDeletion" {
				finalizers = append(finalizers, fin)
			}
		}
		pod.Finalizers = finalizers
	})
}
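
// updatePod applies updateFunc to the latest version of each given Pod,
// retrying on conflicts.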
func updatePod(ctx context.Context, t *testing.T, clientSet clientset.Interface, pods []v1.Pod, updateFunc func(*v1.Pod)) {
t.Helper()
for _, val := range pods {
if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
newPod, err := clientSet.CoreV1().Pods(val.Namespace).Get(ctx, val.Name, metav1.GetOptions{})
if err != nil {
return err
}
updateFunc(newPod)
_, err = clientSet.CoreV1().Pods(val.Namespace).Update(ctx, newPod, metav1.UpdateOptions{})
return err
}); err != nil {
t.Fatalf("Failed to update pod %s: %v", val.Name, err)
}
}
}
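
// failTerminatingPods marks all terminating Pods (those with a deletion
// timestamp) in the namespace as Failed.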
func failTerminatingPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) {
t.Helper()
pods, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
if err != nil {
t.Fatalf("Failed to list pods: %v", err)
}
var terminatingPods []v1.Pod
for _, pod := range pods.Items {
if pod.DeletionTimestamp != nil {
pod.Status.Phase = v1.PodFailed
terminatingPods = append(terminatingPods, pod)
}
}
_, err = updatePodStatuses(ctx, clientSet, terminatingPods)
if err != nil {
t.Fatalf("Failed to update pod statuses: %v", err)
}
}