Merge pull request #121292 from mimowo/backoff-limit-per-index-metrics

Introduce the job_finished_indexes_total metric
Authored by Kubernetes Prow Robot on 2023-10-20 23:50:57 +02:00, committed by GitHub
3 changed files with 107 additions and 4 deletions
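
The test assertions below check a counter with two labels: status ("succeeded"/"failed") and backoffLimit ("perIndex"/"global"). As orientation before the diff, here is a minimal sketch of what the JobFinishedIndexesTotal definition presumably looks like, assuming it follows the component-base/metrics CounterVec pattern of the job controller's existing metrics; the label names and values are taken from the test expectations below, while the help text and option values are illustrative, not copied from the PR.

package metrics

import basemetrics "k8s.io/component-base/metrics"

// JobFinishedIndexesTotal, sketched: the label names ("status", "backoffLimit")
// and their values are inferred from the test expectations in the diff below;
// the subsystem, help text, and stability level are assumptions.
var JobFinishedIndexesTotal = basemetrics.NewCounterVec(
	&basemetrics.CounterOpts{
		Subsystem:      "job_controller",
		Name:           "job_finished_indexes_total",
		Help:           "The number of finished Job indexes, partitioned by the index status and the kind of backoff limit in effect.",
		StabilityLevel: basemetrics.ALPHA,
	},
	[]string{"status", "backoffLimit"},
)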


@@ -981,9 +981,10 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 		},
 	}
 	testCases := map[string]struct {
-		job                  batchv1.Job
-		podTerminations      []podTerminationWithExpectations
-		wantJobConditionType batchv1.JobConditionType
+		job                               batchv1.Job
+		podTerminations                   []podTerminationWithExpectations
+		wantJobConditionType              batchv1.JobConditionType
+		wantJobFinishedIndexesTotalMetric []metricLabelsWithValue
 	}{
 		"job succeeded": {
 			job: batchv1.Job{
@@ -1008,6 +1009,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobComplete,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value: 2,
+				},
+			},
 		},
 		"job index fails due to exceeding backoff limit per index": {
 			job: batchv1.Job{
@@ -1051,6 +1058,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value: 1,
+				},
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value: 1,
+				},
+			},
 		},
 		"job index fails due to exceeding the global backoff limit first": {
 			job: batchv1.Job{
@@ -1094,6 +1111,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value: 0,
+				},
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value: 0,
+				},
+			},
 		},
 		"job continues execution after a failed index, the job is marked Failed due to the failed index": {
 			job: batchv1.Job{
@@ -1128,6 +1155,16 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"succeeded", "perIndex"},
+					Value: 1,
+				},
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value: 1,
+				},
+			},
 		},
 		"job execution terminated early due to exceeding max failed indexes": {
 			job: batchv1.Job{
@@ -1162,6 +1199,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value: 2,
+				},
+			},
 		},
 		"pod failure matching pod failure policy rule with FailIndex action": {
 			job: batchv1.Job{
@@ -1229,6 +1272,12 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				},
 			},
 			wantJobConditionType: batchv1.JobFailed,
+			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
+				{
+					Labels: []string{"failed", "perIndex"},
+					Value: 2,
+				},
+			},
 		},
 	}
 	for name, test := range testCases {
@@ -1289,6 +1338,9 @@ func TestBackoffLimitPerIndex(t *testing.T) {
 				}
 			}
 			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
+			for _, wantMetricValue := range test.wantJobFinishedIndexesTotalMetric {
+				validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, wantMetricValue)
+			}
 			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
 		})
 	}
@@ -1581,6 +1633,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value: 0,
+	})
 
 	// One Pod succeeds.
 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
@@ -1592,6 +1648,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value: 1,
+	})
 
 	// One Pod fails, which should be recreated.
 	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
@@ -1604,6 +1664,10 @@ func TestIndexedJob(t *testing.T) {
 		Ready: ptr.To[int32](0),
 	})
 	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value: 1,
+	})
 
 	// Remaining Pods succeed.
 	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil {
@@ -1619,6 +1683,10 @@ func TestIndexedJob(t *testing.T) {
 	validateJobSucceeded(ctx, t, clientSet, jobObj)
 	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
 	validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 5)
+	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
+		Labels: []string{"succeeded", "global"},
+		Value: 4,
+	})
 }
 
 func TestJobPodReplacementPolicy(t *testing.T) {
@@ -2902,6 +2970,7 @@ func resetMetrics() {
 	metrics.JobFinishedNum.Reset()
 	metrics.JobPodsFinished.Reset()
 	metrics.PodFailuresHandledByFailurePolicy.Reset()
+	metrics.JobFinishedIndexesTotal.Reset()
 }
 
 func createJobControllerWithSharedInformers(tb testing.TB, restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) {
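
For completeness, the producer side that these assertions exercise can be pictured as one increment per finished index. The helper below is hypothetical (the real call sites live in the job controller's status-update path and are not part of this diff); it only illustrates the labeling scheme the tests expect.

package job

import "k8s.io/kubernetes/pkg/controller/job/metrics"

// recordFinishedIndex is a hypothetical illustration: status is "succeeded"
// or "failed"; backoffLimit is "perIndex" when the Job sets
// spec.backoffLimitPerIndex, and "global" otherwise.
func recordFinishedIndex(succeeded, perIndex bool) {
	status := "failed"
	if succeeded {
		status = "succeeded"
	}
	backoffLimit := "global"
	if perIndex {
		backoffLimit = "perIndex"
	}
	metrics.JobFinishedIndexesTotal.WithLabelValues(status, backoffLimit).Inc()
}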