Merge pull request #121292 from mimowo/backoff-limit-per-index-metrics
Introduce the job_finished_indexes_total metric
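The new counter tracks finished indexes of Indexed Jobs, partitioned by result (`succeeded` or `failed`) and by the backoff-limit mode in effect (`perIndex` or `global`), as the test expectations below show. For orientation, here is a minimal sketch of how such a counter vector could be declared using plain client_golang; the real controller registers it through k8s.io/component-base/metrics, and the label names `status` and `backoff_limit` are assumptions, not taken from this diff:

```go
package metrics

import "github.com/prometheus/client_golang/prometheus"

// JobFinishedIndexesTotal counts finished Job indexes. Label values seen in
// the tests below: status is "succeeded" or "failed"; backoff_limit is
// "perIndex" or "global". The label names themselves are assumed.
var JobFinishedIndexesTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "job_finished_indexes_total",
		Help: "Number of finished Job indexes, by result and backoff-limit mode.",
	},
	[]string{"status", "backoff_limit"},
)
```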
@@ -981,9 +981,10 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 		},
 	}
 	testCases := map[string]struct {
-		job                  batchv1.Job
-		podTerminations      []podTerminationWithExpectations
-		wantJobConditionType batchv1.JobConditionType
+		job                               batchv1.Job
+		podTerminations                   []podTerminationWithExpectations
+		wantJobConditionType              batchv1.JobConditionType
+		wantJobFinishedIndexesTotalMetric []metricLabelsWithValue
 	}{
 		"job succeeded": {
 			job: batchv1.Job{
@@ -1008,6 +1009,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobComplete,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value:  2,
+				},
+			},
 		},
 		"job index fails due to exceeding backoff limit per index": {
 			job: batchv1.Job{
@@ -1051,6 +1058,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  1,
+				},
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value:  1,
+				},
+			},
 		},
 		"job index fails due to exceeding the global backoff limit first": {
 			job: batchv1.Job{
@@ -1094,6 +1111,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value:  0,
+				},
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  0,
+				},
+			},
 		},
 		"job continues execution after a failed index, the job is marked Failed due to the failed index": {
 			job: batchv1.Job{
@@ -1128,6 +1155,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value:  1,
+				},
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  1,
+				},
+			},
 		},
 		"job execution terminated early due to exceeding max failed indexes": {
 			job: batchv1.Job{
@@ -1162,6 +1199,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  2,
+				},
+			},
 		},
 		"pod failure matching pod failure policy rule with FailIndex action": {
 			job: batchv1.Job{
@@ -1229,6 +1272,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value:  2,
+				},
+			},
 		},
 	}
 	for name, test := range testCases {
@@ -1289,6 +1338,9 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				}
 			}
 			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
+			for _, wantMetricValue := range test.wantJobFinishedIndexesTotalMetric {
+				validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, wantMetricValue)
+			}
 			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
 		})
 	}
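The `metricLabelsWithValue` and `validateCounterMetric` helpers used above are defined elsewhere in this test file and are not part of this diff. A plausible sketch of their shape, written here against plain client_golang types for self-containment (the real helper works with the component-base metric wrappers and presumably polls until the controller has flushed its status updates, which would explain the context parameter):

```go
import (
	"context"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// metricLabelsWithValue pairs one combination of label values with the
// counter value the test expects for it.
type metricLabelsWithValue struct {
	Labels []string
	Value  int
}

func validateCounterMetric(_ context.Context, t *testing.T, counter *prometheus.CounterVec, want metricLabelsWithValue) {
	t.Helper()
	// ToFloat64 collects the single child series selected by the label values.
	got := testutil.ToFloat64(counter.WithLabelValues(want.Labels...))
	if int(got) != want.Value {
		t.Errorf("metric for labels %v: got %v, want %d", want.Labels, got, want.Value)
	}
}
```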
@@ -1581,6 +1633,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value:  0,
+	})
 
 	// One Pod succeeds.
 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
@@ -1592,6 +1648,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value:  1,
+	})
 
 	// One Pod fails, which should be recreated.
 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
@@ -1604,6 +1664,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value:  1,
+	})
 
 	// Remaining Pods succeed.
 	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil {
@@ -1619,6 +1683,10 @@ func TestIndexedJob(t *testing.T) {
 	validateJobSucceeded(ctx, t, clientSet, jobObj)
 	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
 	validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 5)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value:  4,
+	})
 }
 
 func TestJobPodReplacementPolicy(t *testing.T) {
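Note the label values across the two tests: TestBackoffLimitPerIndex expects `perIndex`, while TestIndexedJob, whose Job sets no per-index backoff, expects `global`. The emission point inside the job controller is not part of this diff; the following is only a hypothetical sketch of how the choice between the two label values might look:

```go
import batchv1 "k8s.io/api/batch/v1"

// backoffLimitLabel is a hypothetical helper: indexes of a Job that sets
// .spec.backoffLimitPerIndex are counted under "perIndex", all other
// indexes under "global".
func backoffLimitLabel(job *batchv1.Job) string {
	if job.Spec.BackoffLimitPerIndex != nil {
		return "perIndex"
	}
	return "global"
}

// At the (assumed) emission point, for an index that just succeeded:
//	metrics.JobFinishedIndexesTotal.WithLabelValues("succeeded", backoffLimitLabel(job)).Inc()
```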
@@ -2902,6 +2970,7 @@ func resetMetrics() {
 	metrics.JobFinishedNum.Reset()
 	metrics.JobPodsFinished.Reset()
 	metrics.PodFailuresHandledByFailurePolicy.Reset()
+	metrics.JobFinishedIndexesTotal.Reset()
 }
 
 func createJobControllerWithSharedInformers(tb testing.TB, restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) {
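The `resetMetrics` change matters because the controller's counters live in a process-wide registry and would otherwise accumulate across test cases, making the expected values above order-dependent. A standalone illustration of the Reset semantics using plain client_golang (the metric name here is illustrative only):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	finished := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "demo_finished_indexes_total"},
		[]string{"status"},
	)
	finished.WithLabelValues("succeeded").Add(3)
	fmt.Println(testutil.ToFloat64(finished.WithLabelValues("succeeded"))) // 3
	// Reset drops every child series; a subsequent test observes only its
	// own increments instead of leftovers from earlier cases.
	finished.Reset()
	fmt.Println(testutil.ToFloat64(finished.WithLabelValues("succeeded"))) // 0
}
```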