Fix the occasional double-counting job_finished_total metric
The reason for the issue is that the metrics were bumped before the final job status update. In case the update failed the path was repeated by the next syncJob leading to double-counting of the metrics. The solution is to delay recording metrics and broadcasting events after the job status update succeeds.
This commit is contained in:
@@ -699,10 +699,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr
|
||||
return false, nil
|
||||
}
|
||||
|
||||
completionMode := string(batch.NonIndexedCompletion)
|
||||
if isIndexedJob(&job) {
|
||||
completionMode = string(batch.IndexedCompletion)
|
||||
}
|
||||
completionMode := getCompletionMode(&job)
|
||||
action := metrics.JobSyncActionReconciling
|
||||
|
||||
defer func() {
|
||||
@@ -906,11 +903,14 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr
|
||||
job.Status.CompletedIndexes = succeededIndexes.String()
|
||||
}
|
||||
job.Status.UncountedTerminatedPods = nil
|
||||
jm.enactJobFinished(&job, finishedCondition)
|
||||
jobFinished := jm.enactJobFinished(&job, finishedCondition)
|
||||
|
||||
if _, err := jm.updateStatusHandler(ctx, &job); err != nil {
|
||||
return forget, err
|
||||
}
|
||||
if jobFinished {
|
||||
jm.recordJobFinished(&job, finishedCondition)
|
||||
}
|
||||
|
||||
if jobHasNewFailure && !IsJobFinished(&job) {
|
||||
// returning an error will re-enqueue Job after the backoff period
|
||||
@@ -1105,13 +1105,17 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
|
||||
if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil {
|
||||
return err
|
||||
}
|
||||
if jm.enactJobFinished(job, finishedCond) {
|
||||
jobFinished := jm.enactJobFinished(job, finishedCond)
|
||||
if jobFinished {
|
||||
needsFlush = true
|
||||
}
|
||||
if needsFlush {
|
||||
if _, err := jm.updateStatusHandler(ctx, job); err != nil {
|
||||
return fmt.Errorf("removing uncounted pods from status: %w", err)
|
||||
}
|
||||
if jobFinished {
|
||||
jm.recordJobFinished(job, finishedCond)
|
||||
}
|
||||
recordJobPodFinished(job, oldCounters)
|
||||
}
|
||||
return nil
|
||||
@@ -1244,16 +1248,20 @@ func (jm *Controller) enactJobFinished(job *batch.Job, finishedCond *batch.JobCo
|
||||
return false
|
||||
}
|
||||
}
|
||||
completionMode := string(batch.NonIndexedCompletion)
|
||||
if isIndexedJob(job) {
|
||||
completionMode = string(*job.Spec.CompletionMode)
|
||||
}
|
||||
job.Status.Conditions, _ = ensureJobConditionStatus(job.Status.Conditions, finishedCond.Type, finishedCond.Status, finishedCond.Reason, finishedCond.Message)
|
||||
if finishedCond.Type == batch.JobComplete {
|
||||
job.Status.CompletionTime = &finishedCond.LastTransitionTime
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// recordJobFinished records events and the job_finished_total metric for a finished job.
|
||||
func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobCondition) bool {
|
||||
completionMode := getCompletionMode(job)
|
||||
if finishedCond.Type == batch.JobComplete {
|
||||
if job.Spec.Completions != nil && job.Status.Succeeded > *job.Spec.Completions {
|
||||
jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
|
||||
}
|
||||
job.Status.CompletionTime = &finishedCond.LastTransitionTime
|
||||
jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed")
|
||||
metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
|
||||
} else {
|
||||
@@ -1613,6 +1621,14 @@ func countValidPodsWithFilter(job *batch.Job, pods []*v1.Pod, uncounted sets.Str
|
||||
return result
|
||||
}
|
||||
|
||||
// getCompletionMode returns string representation of the completion mode. Used as a label value for metrics.
|
||||
func getCompletionMode(job *batch.Job) string {
|
||||
if isIndexedJob(job) {
|
||||
return string(batch.IndexedCompletion)
|
||||
}
|
||||
return string(batch.NonIndexedCompletion)
|
||||
}
|
||||
|
||||
func trackingUncountedPods(job *batch.Job) bool {
|
||||
return feature.DefaultFeatureGate.Enabled(features.JobTrackingWithFinalizers) && hasJobTrackingAnnotation(job)
|
||||
}
|
||||
|
Reference in New Issue
Block a user