kubernetes/pkg/kubelet/server/stats/summary.go
Yu-Ju Hong 191666d6a3 Fix computing of cpu nano core usage
CRI runtimes do not supply cpu nano core usage as it is not part of CRI
stats. However, there are upstream components that still rely on such
stats to function. The previous fix was faulty because the multiple
callers could compete and update the stats, causing
inconsistent/incoherent metrics. This change, instead, creates a
separate call for updating the usage, and rely on eviction manager,
which runs periodically, to trigger the updates. The caveat is that if
eviction manager is completley turned off, no one would compute the
usage.
2019-03-05 09:25:40 -08:00

151 lines
4.8 KiB
Go

/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package stats
import (
"fmt"
"k8s.io/klog"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/util"
)
// SummaryProvider provides summaries of the stats from Kubelet.
type SummaryProvider interface {
// Get provides a new Summary with the stats from Kubelet,
// and will update some stats if updateStats is true
Get(updateStats bool) (*statsapi.Summary, error)
// GetCPUAndMemoryStats provides a new Summary with the CPU and memory stats from Kubelet,
GetCPUAndMemoryStats() (*statsapi.Summary, error)
}
// summaryProviderImpl implements the SummaryProvider interface.
type summaryProviderImpl struct {
// kubeletCreationTime is the time at which the summaryProvider was created.
kubeletCreationTime metav1.Time
// systemBootTime is the time at which the system was started
systemBootTime metav1.Time
provider Provider
}
var _ SummaryProvider = &summaryProviderImpl{}
// NewSummaryProvider returns a SummaryProvider using the stats provided by the
// specified statsProvider.
func NewSummaryProvider(statsProvider Provider) SummaryProvider {
kubeletCreationTime := metav1.Now()
bootTime, err := util.GetBootTime()
if err != nil {
// bootTime will be zero if we encounter an error getting the boot time.
klog.Warningf("Error getting system boot time. Node metrics will have an incorrect start time: %v", err)
}
return &summaryProviderImpl{
kubeletCreationTime: kubeletCreationTime,
systemBootTime: metav1.NewTime(bootTime),
provider: statsProvider,
}
}
func (sp *summaryProviderImpl) Get(updateStats bool) (*statsapi.Summary, error) {
// TODO(timstclair): Consider returning a best-effort response if any of
// the following errors occur.
node, err := sp.provider.GetNode()
if err != nil {
return nil, fmt.Errorf("failed to get node info: %v", err)
}
nodeConfig := sp.provider.GetNodeConfig()
rootStats, networkStats, err := sp.provider.GetCgroupStats("/", updateStats)
if err != nil {
return nil, fmt.Errorf("failed to get root cgroup stats: %v", err)
}
rootFsStats, err := sp.provider.RootFsStats()
if err != nil {
return nil, fmt.Errorf("failed to get rootFs stats: %v", err)
}
imageFsStats, err := sp.provider.ImageFsStats()
if err != nil {
return nil, fmt.Errorf("failed to get imageFs stats: %v", err)
}
var podStats []statsapi.PodStats
if updateStats {
podStats, err = sp.provider.ListPodStatsAndUpdateCPUNanoCoreUsage()
} else {
podStats, err = sp.provider.ListPodStats()
}
if err != nil {
return nil, fmt.Errorf("failed to list pod stats: %v", err)
}
rlimit, err := sp.provider.RlimitStats()
if err != nil {
return nil, fmt.Errorf("failed to get rlimit stats: %v", err)
}
nodeStats := statsapi.NodeStats{
NodeName: node.Name,
CPU: rootStats.CPU,
Memory: rootStats.Memory,
Network: networkStats,
StartTime: sp.systemBootTime,
Fs: rootFsStats,
Runtime: &statsapi.RuntimeStats{ImageFs: imageFsStats},
Rlimit: rlimit,
SystemContainers: sp.GetSystemContainersStats(nodeConfig, podStats, updateStats),
}
summary := statsapi.Summary{
Node: nodeStats,
Pods: podStats,
}
return &summary, nil
}
func (sp *summaryProviderImpl) GetCPUAndMemoryStats() (*statsapi.Summary, error) {
// TODO(timstclair): Consider returning a best-effort response if any of
// the following errors occur.
node, err := sp.provider.GetNode()
if err != nil {
return nil, fmt.Errorf("failed to get node info: %v", err)
}
nodeConfig := sp.provider.GetNodeConfig()
rootStats, err := sp.provider.GetCgroupCPUAndMemoryStats("/", false)
if err != nil {
return nil, fmt.Errorf("failed to get root cgroup stats: %v", err)
}
podStats, err := sp.provider.ListPodCPUAndMemoryStats()
if err != nil {
return nil, fmt.Errorf("failed to list pod stats: %v", err)
}
nodeStats := statsapi.NodeStats{
NodeName: node.Name,
CPU: rootStats.CPU,
Memory: rootStats.Memory,
StartTime: rootStats.StartTime,
SystemContainers: sp.GetSystemContainersCPUAndMemoryStats(nodeConfig, podStats, false),
}
summary := statsapi.Summary{
Node: nodeStats,
Pods: podStats,
}
return &summary, nil
}