Merge pull request #121292 from mimowo/backoff-limit-per-index-metrics
Introduce the job_finished_indexes_total metric
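The new counter tracks finished indexes of Indexed Jobs, partitioned by result (`succeeded` or `failed`) and by the backoff-limit mode in effect (`perIndex` or `global`), as the test expectations below show. For orientation, here is a minimal sketch of how such a counter vector could be declared using plain client_golang; the real controller registers it through k8s.io/component-base/metrics, and the label names `status` and `backoff_limit` are assumptions, not taken from this diff:

```go
package metrics

import "github.com/prometheus/client_golang/prometheus"

// JobFinishedIndexesTotal counts finished Job indexes. Label values seen in
// the tests below: status is "succeeded" or "failed"; backoff_limit is
// "perIndex" or "global". The label names themselves are assumed.
var JobFinishedIndexesTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "job_finished_indexes_total",
		Help: "Number of finished Job indexes, by result and backoff-limit mode.",
	},
	[]string{"status", "backoff_limit"},
)
```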
@@ -981,9 +981,10 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 		},
 	}
 	testCases := map[string]struct {
-		job                  batchv1.Job
-		podTerminations      []podTerminationWithExpectations
-		wantJobConditionType batchv1.JobConditionType
+		job                               batchv1.Job
+		podTerminations                   []podTerminationWithExpectations
+		wantJobConditionType              batchv1.JobConditionType
+		wantJobFinishedIndexesTotalMetric []metricLabelsWithValue
 	}{
 		"job succeeded": {
 			job: batchv1.Job{
@@ -1008,6 +1009,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobComplete,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value:  2,
+				},
+			},
 		},
 		"job index fails due to exceeding backoff limit per index": {
 			job: batchv1.Job{
@@ -1051,6 +1058,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  1,
+				},
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value:  1,
+				},
+			},
 		},
 		"job index fails due to exceeding the global backoff limit first": {
 			job: batchv1.Job{
@@ -1094,6 +1111,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value:  0,
+				},
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  0,
+				},
+			},
 		},
 		"job continues execution after a failed index, the job is marked Failed due to the failed index": {
 			job: batchv1.Job{
@@ -1128,6 +1155,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value:  1,
+				},
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  1,
+				},
+			},
 		},
 		"job execution terminated early due to exceeding max failed indexes": {
 			job: batchv1.Job{
@@ -1162,6 +1199,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  2,
+				},
+			},
 		},
 		"pod failure matching pod failure policy rule with FailIndex action": {
 			job: batchv1.Job{
@@ -1229,6 +1272,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  2,
+				},
+			},
 		},
 	}
 	for name, test := range testCases {
@@ -1289,6 +1338,9 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				}
 			}
 			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
+			for _, wantMetricValue := range test.wantJobFinishedIndexesTotalMetric {
+				validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, wantMetricValue)
+			}
 			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
 		})
 	}
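The `metricLabelsWithValue` and `validateCounterMetric` helpers used above are defined elsewhere in this test file and are not part of this diff. A plausible sketch of their shape, written here against plain client_golang types for self-containment (the real helper works with the component-base metric wrappers and presumably polls until the controller has flushed its status updates, which would explain the context parameter):

```go
import (
	"context"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// metricLabelsWithValue pairs one combination of label values with the
// counter value the test expects for it.
type metricLabelsWithValue struct {
	Labels []string
	Value  int
}

func validateCounterMetric(_ context.Context, t *testing.T, counter *prometheus.CounterVec, want metricLabelsWithValue) {
	t.Helper()
	// ToFloat64 collects the single child series selected by the label values.
	got := testutil.ToFloat64(counter.WithLabelValues(want.Labels...))
	if int(got) != want.Value {
		t.Errorf("metric for labels %v: got %v, want %d", want.Labels, got, want.Value)
	}
}
```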
@@ -1581,6 +1633,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value:  0,
+	})
 
 	// One Pod succeeds.
 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
@@ -1592,6 +1648,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value:  1,
+	})
 
 	// One Pod fails, which should be recreated.
 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
@@ -1604,6 +1664,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value:  1,
+	})
 
 	// Remaining Pods succeed.
 	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil {
@@ -1619,6 +1683,10 @@ func TestIndexedJob(t *testing.T) {
 	validateJobSucceeded(ctx, t, clientSet, jobObj)
 	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
 	validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 5)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value:  4,
+	})
 }
 
 func TestJobPodReplacementPolicy(t *testing.T) {
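Note the label values across the two tests: TestBackoffLimitPerIndex expects `perIndex`, while TestIndexedJob, whose Job sets no per-index backoff, expects `global`. The emission point inside the job controller is not part of this diff; the following is only a hypothetical sketch of how the choice between the two label values might look:

```go
import batchv1 "k8s.io/api/batch/v1"

// backoffLimitLabel is a hypothetical helper: indexes of a Job that sets
// .spec.backoffLimitPerIndex are counted under "perIndex", all other
// indexes under "global".
func backoffLimitLabel(job *batchv1.Job) string {
	if job.Spec.BackoffLimitPerIndex != nil {
		return "perIndex"
	}
	return "global"
}

// At the (assumed) emission point, for an index that just succeeded:
//	metrics.JobFinishedIndexesTotal.WithLabelValues("succeeded", backoffLimitLabel(job)).Inc()
```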
@@ -2902,6 +2970,7 @@ func resetMetrics() {
 	metrics.JobFinishedNum.Reset()
 	metrics.JobPodsFinished.Reset()
 	metrics.PodFailuresHandledByFailurePolicy.Reset()
+	metrics.JobFinishedIndexesTotal.Reset()
 }
 
 func createJobControllerWithSharedInformers(tb testing.TB, restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) {
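The `resetMetrics` change matters because the controller's counters live in a process-wide registry and would otherwise accumulate across test cases, making the expected values above order-dependent. A standalone illustration of the Reset semantics using plain client_golang (the metric name here is illustrative only):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	finished := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "demo_finished_indexes_total"},
		[]string{"status"},
	)
	finished.WithLabelValues("succeeded").Add(3)
	fmt.Println(testutil.ToFloat64(finished.WithLabelValues("succeeded"))) // 3
	// Reset drops every child series; a subsequent test observes only its
	// own increments instead of leftovers from earlier cases.
	finished.Reset()
	fmt.Println(testutil.ToFloat64(finished.WithLabelValues("succeeded"))) // 0
}
```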