Introduce the job_finished_indexes_total metric

This commit is contained in:
Michal Wozniak
2023-10-17 15:31:32 +02:00
parent 7b9d244efd
commit b0d04d933b
3 changed files with 107 additions and 4 deletions

View File

@@ -1842,8 +1842,16 @@ func recordJobPodFinished(logger klog.Logger, job *batch.Job, oldCounters batch.
// in tandem, and now a previously completed index is
// now out of range (i.e. index >= spec.Completions).
if isIndexedJob(job) {
completions := int(*job.Spec.Completions)
if job.Status.CompletedIndexes != oldCounters.CompletedIndexes {
diff = parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)).total() - parseIndexesFromString(logger, oldCounters.CompletedIndexes, int(*job.Spec.Completions)).total()
diff = indexesCount(logger, &job.Status.CompletedIndexes, completions) - indexesCount(logger, &oldCounters.CompletedIndexes, completions)
}
backoffLimitLabel := backoffLimitMetricsLabel(job)
metrics.JobFinishedIndexesTotal.WithLabelValues(metrics.Succeeded, backoffLimitLabel).Add(float64(diff))
if hasBackoffLimitPerIndex(job) && job.Status.FailedIndexes != oldCounters.FailedIndexes {
if failedDiff := indexesCount(logger, job.Status.FailedIndexes, completions) - indexesCount(logger, oldCounters.FailedIndexes, completions); failedDiff > 0 {
metrics.JobFinishedIndexesTotal.WithLabelValues(metrics.Failed, backoffLimitLabel).Add(float64(failedDiff))
}
}
} else {
diff = int(job.Status.Succeeded) - int(oldCounters.Succeeded)
@@ -1855,6 +1863,20 @@ func recordJobPodFinished(logger klog.Logger, job *batch.Job, oldCounters batch.
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
}
// indexesCount reports how many indexes are described by indexesStr.
//
// A nil indexesStr counts as zero indexes. Otherwise the pointed-to string is
// handed to parseIndexesFromString together with logger and completions, and
// the total() of the parsed result is returned.
func indexesCount(logger klog.Logger, indexesStr *string, completions int) int {
	if indexesStr != nil {
		parsed := parseIndexesFromString(logger, *indexesStr, completions)
		return parsed.total()
	}
	// Nothing recorded yet: there are no indexes to count.
	return 0
}
// backoffLimitMetricsLabel picks the metric label value describing which
// backoff limit mode the job uses: "perIndex" when hasBackoffLimitPerIndex
// reports true for the job, and "global" in every other case.
func backoffLimitMetricsLabel(job *batch.Job) string {
	label := "global"
	if hasBackoffLimitPerIndex(job) {
		label = "perIndex"
	}
	return label
}
func recordJobPodFailurePolicyActions(job *batch.Job, podFailureCountByPolicyAction map[string]int) {
for action, count := range podFailureCountByPolicyAction {
metrics.PodFailuresHandledByFailurePolicy.WithLabelValues(action).Add(float64(count))

View File

@@ -114,6 +114,17 @@ var (
that have the finalizer batch.kubernetes.io/job-tracking
The event label can be "add" or "delete".`,
}, []string{"event"})
// JobFinishedIndexesTotal is a counter vector named
// "job_finished_indexes_total" in the JobControllerSubsystem that records the
// number of finished indexes. Each observation carries two labels, in this
// order:
//   - "status": "succeeded" or "failed", as listed in the Help text.
//   - "backoffLimit": "perIndex" or "global", as listed in the Help text.
JobFinishedIndexesTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: JobControllerSubsystem,
Name: "job_finished_indexes_total",
Help: `The number of finished indexes. Possible values for the
status label are: "succeeded", "failed". Possible values for the
backoffLimit label are: "perIndex" and "global"`,
},
// Label names; callers must pass values in this same order.
[]string{"status", "backoffLimit"})
)
const (
@@ -158,5 +169,6 @@ func Register() {
legacyregistry.MustRegister(JobPodsFinished)
legacyregistry.MustRegister(PodFailuresHandledByFailurePolicy)
legacyregistry.MustRegister(TerminatedPodsTrackingFinalizerTotal)
legacyregistry.MustRegister(JobFinishedIndexesTotal)
})
}