Add scheduler throughput metric.

This adds a counter to the scheduler that can be used to calculate
throughput and error ratio. Pods which fail to schedule are not counted
as errors, but can still be tracked separately from successes.

We already measure scheduler latency, but throughput was missing. This
should be considered a key metric for the scheduler.
This commit is contained in:
Jonathan Basseri
2018-05-30 11:59:55 -07:00
parent 170dcc2ea0
commit b0a8dbbc9d
2 changed files with 31 additions and 2 deletions

View File

@@ -46,6 +46,18 @@ const (
// All the histogram based metrics have 1ms as size for the smallest bucket.
var (
scheduleAttempts = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
}, []string{"result"})
// PodScheduleSuccesses counts how many pods were scheduled.
PodScheduleSuccesses = scheduleAttempts.With(prometheus.Labels{"result": "scheduled"})
// PodScheduleFailures counts how many pods could not be scheduled.
PodScheduleFailures = scheduleAttempts.With(prometheus.Labels{"result": "unschedulable"})
// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
PodScheduleErrors = scheduleAttempts.With(prometheus.Labels{"result": "error"})
SchedulingLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Subsystem: SchedulerSubsystem,
@@ -135,6 +147,7 @@ var (
}, []string{"result"})
metricsList = []prometheus.Collector{
scheduleAttempts,
SchedulingLatency,
E2eSchedulingLatency,
SchedulingAlgorithmLatency,