Merge pull request #68068 from krzysztof-jastrzebski/hpas2
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions here: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md.

Change CPU sample sanitization in HPA.

**What this PR does / why we need it**:
Change CPU sample sanitization in HPA. Ignore samples if:
- Pod is being initialized (5 minutes from start, defined by flag):
  - pod is unready
  - pod is ready, but a full window of the metric hasn't been collected since the readiness transition
- Pod is initialized (past 5 minutes from start, defined by flag):
  - pod has never been ready after the initial readiness period

**Release notes:**
```release-note
Improve CPU sample sanitization in HPA by taking the metric's freshness into account.
```
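For readers who want the decision rule at a glance, below is a minimal, self-contained sketch of the sanitization logic described above; the authoritative version is the `groupPods` change in the diff that follows. The helper name `shouldIgnoreCPUSample` and the concrete durations in `main` are illustrative only; the real controller takes them from the `ReplicaCalculator` fields and from the pod and metric objects.

```go
package main

import (
	"fmt"
	"time"
)

// shouldIgnoreCPUSample is a hypothetical stand-in for the condition computed in
// groupPods: podStart and lastTransition come from the pod's status, podReady from
// its Ready condition, and metricTimestamp/metricWindow from the CPU sample itself.
func shouldIgnoreCPUSample(now, podStart time.Time, podReady bool, lastTransition time.Time,
	metricTimestamp time.Time, metricWindow time.Duration,
	cpuInitializationPeriod, delayOfInitialReadinessStatus time.Duration) bool {
	if podStart.Add(cpuInitializationPeriod).After(now) {
		// Pod may still be initializing: drop the sample if the pod is unready,
		// or if a full metric window hasn't been collected since the last readiness transition.
		return !podReady || metricTimestamp.Before(lastTransition.Add(metricWindow))
	}
	// Pod is past the initialization period: drop the sample only if the pod is
	// unready and has never been ready after the initial readiness period.
	return !podReady && podStart.Add(delayOfInitialReadinessStatus).After(lastTransition)
}

func main() {
	now := time.Now()
	start := now.Add(-1 * time.Minute) // pod started one minute ago

	// An unready pod still within the initialization period: its sample is ignored.
	fmt.Println(shouldIgnoreCPUSample(
		now, start, false, start, now, 30*time.Second, 5*time.Minute, 30*time.Second))
}
```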
@@ -32,12 +32,10 @@ import (
 )
 
 const (
-	// TODO(jbartosik): use actual value.
-	cpuSampleWindow = time.Minute
 	// defaultTestingTolerance is default value for calculating when to
 	// scale up/scale down.
 	defaultTestingTolerance = 0.1
-	defaultTestingCpuTaintAfterStart = 2 * time.Minute
+	defaultTestingCpuInitializationPeriod = 2 * time.Minute
 	defaultTestingDelayOfInitialReadinessStatus = 10 * time.Second
 )
 
@@ -45,16 +43,16 @@ type ReplicaCalculator struct {
 	metricsClient metricsclient.MetricsClient
 	podsGetter v1coreclient.PodsGetter
 	tolerance float64
-	cpuTaintAfterStart time.Duration
+	cpuInitializationPeriod time.Duration
 	delayOfInitialReadinessStatus time.Duration
 }
 
-func NewReplicaCalculator(metricsClient metricsclient.MetricsClient, podsGetter v1coreclient.PodsGetter, tolerance float64, cpuTaintAfterStart, delayOfInitialReadinessStatus time.Duration) *ReplicaCalculator {
+func NewReplicaCalculator(metricsClient metricsclient.MetricsClient, podsGetter v1coreclient.PodsGetter, tolerance float64, cpuInitializationPeriod, delayOfInitialReadinessStatus time.Duration) *ReplicaCalculator {
 	return &ReplicaCalculator{
 		metricsClient: metricsClient,
 		podsGetter: podsGetter,
 		tolerance: tolerance,
-		cpuTaintAfterStart: cpuTaintAfterStart,
+		cpuInitializationPeriod: cpuInitializationPeriod,
 		delayOfInitialReadinessStatus: delayOfInitialReadinessStatus,
 	}
 }
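The constructor change is purely the renamed parameter. The following is a hedged wiring sketch of a call site, not the controller's real setup: the nil clients are placeholders, the import paths and interface names are the ones this file already uses, and the durations only mirror the "5 minutes from start defined by flag" semantics from the PR description rather than real flag parsing.

```go
package main

import (
	"time"

	v1coreclient "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/kubernetes/pkg/controller/podautoscaler"
	metricsclient "k8s.io/kubernetes/pkg/controller/podautoscaler/metrics"
)

func main() {
	// Placeholders only: a real caller passes a metrics client backed by the
	// resource/custom metrics APIs and a pods getter from a core v1 clientset.
	var metricsClient metricsclient.MetricsClient
	var podsGetter v1coreclient.PodsGetter

	_ = podautoscaler.NewReplicaCalculator(
		metricsClient,
		podsGetter,
		0.1,            // tolerance
		5*time.Minute,  // cpuInitializationPeriod (formerly cpuTaintAfterStart)
		30*time.Second, // delayOfInitialReadinessStatus
	)
}
```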
@@ -77,7 +75,7 @@ func (c *ReplicaCalculator) GetResourceReplicas(currentReplicas int32, targetUti
 		return 0, 0, 0, time.Time{}, fmt.Errorf("no pods returned by selector while calculating replica count")
 	}
 
-	readyPodCount, ignoredPods, missingPods := groupPods(podList.Items, metrics, resource, c.cpuTaintAfterStart, c.delayOfInitialReadinessStatus)
+	readyPodCount, ignoredPods, missingPods := groupPods(podList.Items, metrics, resource, c.cpuInitializationPeriod, c.delayOfInitialReadinessStatus)
 	removeMetricsForPods(metrics, ignoredPods)
 	requests, err := calculatePodRequests(podList.Items, resource)
 	if err != nil {
@@ -108,12 +106,12 @@ func (c *ReplicaCalculator) GetResourceReplicas(currentReplicas int32, targetUti
 		if usageRatio < 1.0 {
 			// on a scale-down, treat missing pods as using 100% of the resource request
 			for podName := range missingPods {
-				metrics[podName] = requests[podName]
+				metrics[podName] = metricsclient.PodMetric{Value: requests[podName]}
 			}
 		} else if usageRatio > 1.0 {
 			// on a scale-up, treat missing pods as using 0% of the resource request
 			for podName := range missingPods {
-				metrics[podName] = 0
+				metrics[podName] = metricsclient.PodMetric{Value: 0}
 			}
 		}
 	}
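Note that the fallback values for missing pods are now wrapped in `metricsclient.PodMetric` instead of being stored as bare numbers. The struct's definition is not part of this diff; judging only from the fields the diff uses (`Value` here, plus `Timestamp` and `Window` in `groupPods` further down), its shape is roughly the following sketch.

```go
// Sketch only: inferred from the fields this diff uses, not copied from the
// metrics package (imported in this file under the alias metricsclient).
package metrics

import "time"

// PodMetric bundles a sample's value with the time it was taken and the window
// it covers, which is what lets groupPods judge the sample's freshness.
type PodMetric struct {
	Timestamp time.Time
	Window    time.Duration
	Value     int64
}
```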
@@ -121,7 +119,7 @@ func (c *ReplicaCalculator) GetResourceReplicas(currentReplicas int32, targetUti
 	if rebalanceIgnored {
 		// on a scale-up, treat unready pods as using 0% of the resource request
 		for podName := range ignoredPods {
-			metrics[podName] = 0
+			metrics[podName] = metricsclient.PodMetric{Value: 0}
 		}
 	}
 
@@ -178,7 +176,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
 		return 0, 0, fmt.Errorf("no pods returned by selector while calculating replica count")
 	}
 
-	readyPodCount, ignoredPods, missingPods := groupPods(podList.Items, metrics, resource, c.cpuTaintAfterStart, c.delayOfInitialReadinessStatus)
+	readyPodCount, ignoredPods, missingPods := groupPods(podList.Items, metrics, resource, c.cpuInitializationPeriod, c.delayOfInitialReadinessStatus)
 	removeMetricsForPods(metrics, ignoredPods)
 
 	if len(metrics) == 0 {
@@ -203,12 +201,12 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
 		if usageRatio < 1.0 {
 			// on a scale-down, treat missing pods as using 100% of the resource request
 			for podName := range missingPods {
-				metrics[podName] = targetUtilization
+				metrics[podName] = metricsclient.PodMetric{Value: targetUtilization}
 			}
 		} else {
 			// on a scale-up, treat missing pods as using 0% of the resource request
 			for podName := range missingPods {
-				metrics[podName] = 0
+				metrics[podName] = metricsclient.PodMetric{Value: 0}
 			}
 		}
 	}
@@ -216,7 +214,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
 	if rebalanceIgnored {
 		// on a scale-up, treat unready pods as using 0% of the resource request
 		for podName := range ignoredPods {
-			metrics[podName] = 0
+			metrics[podName] = metricsclient.PodMetric{Value: 0}
 		}
 	}
 
@@ -342,14 +340,15 @@ func (c *ReplicaCalculator) GetExternalPerPodMetricReplicas(currentReplicas int3
 	return replicaCount, utilization, timestamp, nil
 }
 
-func groupPods(pods []v1.Pod, metrics metricsclient.PodMetricsInfo, resource v1.ResourceName, cpuTaintAfterStart, delayOfInitialReadinessStatus time.Duration) (readyPodCount int, ignoredPods sets.String, missingPods sets.String) {
+func groupPods(pods []v1.Pod, metrics metricsclient.PodMetricsInfo, resource v1.ResourceName, cpuInitializationPeriod, delayOfInitialReadinessStatus time.Duration) (readyPodCount int, ignoredPods sets.String, missingPods sets.String) {
 	missingPods = sets.NewString()
 	ignoredPods = sets.NewString()
 	for _, pod := range pods {
 		if pod.DeletionTimestamp != nil || pod.Status.Phase == v1.PodFailed {
 			continue
 		}
-		if _, found := metrics[pod.Name]; !found {
+		metric, found := metrics[pod.Name]
+		if !found {
 			missingPods.Insert(pod.Name)
 			continue
 		}
@@ -359,10 +358,13 @@ func groupPods(pods []v1.Pod, metrics metricsclient.PodMetricsInfo, resource v1.
 			if condition == nil || pod.Status.StartTime == nil {
 				ignorePod = true
 			} else {
-				if condition.Status == v1.ConditionTrue {
-					ignorePod = pod.Status.StartTime.Add(cpuTaintAfterStart + cpuSampleWindow).After(time.Now())
+				// Pod still within possible initialisation period.
+				if pod.Status.StartTime.Add(cpuInitializationPeriod).After(time.Now()) {
+					// Ignore sample if pod is unready or one window of metric wasn't collected since last state transition.
+					ignorePod = condition.Status == v1.ConditionFalse || metric.Timestamp.Before(condition.LastTransitionTime.Time.Add(metric.Window))
 				} else {
-					ignorePod = pod.Status.StartTime.Add(delayOfInitialReadinessStatus).After(condition.LastTransitionTime.Time)
+					// Ignore metric if pod is unready and it has never been ready.
+					ignorePod = condition.Status == v1.ConditionFalse && pod.Status.StartTime.Add(delayOfInitialReadinessStatus).After(condition.LastTransitionTime.Time)
 				}
 			}
 			if ignorePod {
@@ -370,7 +372,6 @@ func groupPods(pods []v1.Pod, metrics metricsclient.PodMetricsInfo, resource v1.
 				continue
 			}
 		}
-
 		readyPodCount++
 	}
 	return