Merge pull request #113176 from alculquicondor/finalizer_metric

Add metric for terminated pods with tracking finalizer
This commit is contained in:
Kubernetes Prow Robot
2022-10-20 20:52:01 -07:00
committed by GitHub
5 changed files with 197 additions and 19 deletions

View File

@@ -78,9 +78,7 @@ func TestMetrics(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "simple")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
defer func() {
cancel()
}()
defer cancel()
testCases := map[string]struct {
job *batchv1.Job
@@ -144,13 +142,14 @@ func TestMetrics(t *testing.T) {
validateJobSucceeded(ctx, t, clientSet, jobObj)
// verify metric values after the job is finished
validateMetricValue(t, metrics.JobFinishedNum, tc.wantJobFinishedNumMetric)
validateMetricValue(t, metrics.JobPodsFinished, tc.wantJobPodsFinishedMetric)
validateCounterMetric(t, metrics.JobFinishedNum, tc.wantJobFinishedNumMetric)
validateCounterMetric(t, metrics.JobPodsFinished, tc.wantJobPodsFinishedMetric)
validateTerminatedPodsTrackingFinalizerMetric(t, int(*jobObj.Spec.Parallelism))
})
}
}
func validateMetricValue(t *testing.T, counterVec *basemetrics.CounterVec, wantMetric metricLabelsWithValue) {
func validateCounterMetric(t *testing.T, counterVec *basemetrics.CounterVec, wantMetric metricLabelsWithValue) {
t.Helper()
var cmpErr error
err := wait.PollImmediate(10*time.Millisecond, 10*time.Second, func() (bool, error) {
@@ -166,13 +165,24 @@ func validateMetricValue(t *testing.T, counterVec *basemetrics.CounterVec, wantM
return true, nil
})
if err != nil {
t.Errorf("Failed waiting for expected metric delta: %q", err)
t.Errorf("Failed waiting for expected metric: %q", err)
}
if cmpErr != nil {
t.Error(cmpErr)
}
}
func validateTerminatedPodsTrackingFinalizerMetric(t *testing.T, want int) {
validateCounterMetric(t, metrics.TerminatedPodsTrackingFinalizerTotal, metricLabelsWithValue{
Value: want,
Labels: []string{metrics.Add},
})
validateCounterMetric(t, metrics.TerminatedPodsTrackingFinalizerTotal, metricLabelsWithValue{
Value: want,
Labels: []string{metrics.Delete},
})
}
// TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart verifies that the job is properly marked as Failed
// in a scenario when the job controller crashes between removing pod finalizers and marking the job as Failed (based on
// the pod failure policy). After the finalizer for the failed pod is removed we remove the failed pod. This step is
@@ -238,6 +248,7 @@ func TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart(t *testi
defer func() {
cancel()
}()
resetMetrics()
restConfig.QPS = 200
restConfig.Burst = 200
@@ -556,6 +567,7 @@ func TestParallelJob(t *testing.T) {
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
defer cancel()
resetMetrics()
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
Spec: batchv1.JobSpec{
@@ -631,6 +643,9 @@ func TestParallelJob(t *testing.T) {
}
validateJobPodsStatus(ctx, t, clientSet, jobObj, want, false)
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
if tc.trackWithFinalizers {
validateTerminatedPodsTrackingFinalizerMetric(t, 7)
}
})
}
}
@@ -803,9 +818,8 @@ func TestIndexedJob(t *testing.T) {
closeFn, restConfig, clientSet, ns := setup(t, "indexed")
defer closeFn()
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
defer func() {
cancel()
}()
defer cancel()
resetMetrics()
mode := batchv1.IndexedCompletion
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
@@ -863,6 +877,9 @@ func TestIndexedJob(t *testing.T) {
validateIndexedJobPods(ctx, t, clientSet, jobObj, nil, "0-3")
validateJobSucceeded(ctx, t, clientSet, jobObj)
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
if wFinalizers {
validateTerminatedPodsTrackingFinalizerMetric(t, 5)
}
})
}
}
@@ -957,6 +974,7 @@ func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) {
restConfig.QPS = 1
restConfig.Burst = 1
jc, ctx, cancel := createJobControllerWithSharedInformers(restConfig, informerSet)
resetMetrics()
defer cancel()
restConfig.QPS = 200
restConfig.Burst = 200
@@ -989,6 +1007,8 @@ func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) {
t.Fatalf("Failed to delete job: %v", err)
}
validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj)
// Pods never finished, so they are not counted in the metric.
validateTerminatedPodsTrackingFinalizerMetric(t, 0)
})
}
}
@@ -1676,6 +1696,7 @@ func startJobControllerAndWaitForCaches(restConfig *restclient.Config) (context.
}
// resetMetrics calls Reset on each job controller counter that the tests in
// this file validate. Tests invoke it before creating jobs; presumably this
// keeps counts recorded by an earlier test from leaking into later assertions.
func resetMetrics() {
	// Order matches the original sequence of Reset calls.
	counters := []interface{ Reset() }{
		metrics.TerminatedPodsTrackingFinalizerTotal,
		metrics.JobFinishedNum,
		metrics.JobPodsFinished,
	}
	for _, c := range counters {
		c.Reset()
	}
}