Bump cAdvisor to v0.43.0

Bumping cAdvisor from v0.39.2 -> v0.43.0

* Also pin transitive dependencies
  * containerd v1.4.9 -> v1.4.11
  * docker v20.10.2+incompatible -> v20.10.7+incompatible

Signed-off-by: David Porter <david@porter.me>
This commit is contained in:
David Porter
2021-11-09 14:23:06 -08:00
parent e4adf7f31c
commit c6452be958
80 changed files with 1637 additions and 465 deletions

View File

@@ -27,6 +27,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/cadvisor/cache/memory"
@@ -102,6 +103,8 @@ type containerData struct {
// resctrlCollector updates stats for resctrl controller.
resctrlCollector stats.Collector
oomEvents uint64
}
// jitter returns a time.Duration between duration and duration + maxFactor * duration,
@@ -127,6 +130,7 @@ func (cd *containerData) Stop() error {
}
close(cd.stop)
cd.perfCollector.Destroy()
cd.resctrlCollector.Destroy()
return nil
}
@@ -668,6 +672,9 @@ func (cd *containerData) updateStats() error {
klog.V(2).Infof("Failed to add summary stats for %q: %v", cd.info.Name, err)
}
}
stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents)
var customStatsErr error
cm := cd.collectorManager.(*collector.GenericCollectorManager)
if len(cm.Collectors) > 0 {
@@ -721,7 +728,7 @@ func (cd *containerData) updateStats() error {
return perfStatsErr
}
if resctrlStatsErr != nil {
klog.Errorf("error occurred while collecting resctrl stats for container %s: %s", cInfo.Name, err)
klog.Errorf("error occurred while collecting resctrl stats for container %s: %s", cInfo.Name, resctrlStatsErr)
return resctrlStatsErr
}
return customStatsErr

View File

@@ -24,18 +24,18 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
"github.com/google/cadvisor/container/docker"
"github.com/google/cadvisor/container/raw"
"github.com/google/cadvisor/events"
"github.com/google/cadvisor/fs"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/info/v2"
v2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/machine"
"github.com/google/cadvisor/nvm"
"github.com/google/cadvisor/perf"
@@ -47,8 +47,6 @@ import (
"github.com/google/cadvisor/watcher"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
@@ -61,6 +59,14 @@ var eventStorageAgeLimit = flag.String("event_storage_age_limit", "default=24h",
var eventStorageEventLimit = flag.String("event_storage_event_limit", "default=100000", "Max number of events to store (per type). Value is a comma separated list of key values, where the keys are event types (e.g.: creation, oom) or \"default\" and the value is an integer. Default is applied to all non-specified event types")
var applicationMetricsCountLimit = flag.Int("application_metrics_count_limit", 100, "Max number of application metrics to store (per container)")
// The namespace under which Docker aliases are unique.
const DockerNamespace = "docker"
var HousekeepingConfigFlags = HouskeepingConfig{
flag.Duration("max_housekeeping_interval", 60*time.Second, "Largest interval to allow between container housekeepings"),
flag.Bool("allow_dynamic_housekeeping", true, "Whether to allow the housekeeping interval to be dynamic"),
}
// The Manager interface defines operations for starting a manager and getting
// container and machine information.
type Manager interface {
@@ -129,12 +135,6 @@ type Manager interface {
CloseEventChannel(watchID int)
// Get status information about docker.
DockerInfo() (info.DockerStatus, error)
// Get details about interesting docker images.
DockerImages() ([]info.DockerImage, error)
// Returns debugging information. Map of lines per category.
DebugInfo() map[string][]string
}
@@ -146,7 +146,7 @@ type HouskeepingConfig = struct {
}
// New takes a memory storage and returns a new manager.
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList []string, perfEventsFile string) (Manager, error) {
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string, resctrlInterval time.Duration) (Manager, error) {
if memoryCache == nil {
return nil, fmt.Errorf("manager requires memory storage")
}
@@ -203,6 +203,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
collectorHTTPClient: collectorHTTPClient,
nvidiaManager: accelerators.NewNvidiaManager(includedMetricsSet),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
containerEnvMetadataWhiteList: containerEnvMetadataWhiteList,
}
machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
@@ -217,7 +218,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
return nil, err
}
newManager.resctrlManager, err = resctrl.NewManager(selfContainer)
newManager.resctrlManager, err = resctrl.NewManager(resctrlInterval, resctrl.Setup, machineInfo.CPUVendorID, inHostNamespace)
if err != nil {
klog.V(4).Infof("Cannot gather resctrl metrics: %v", err)
}
@@ -262,9 +263,11 @@ type manager struct {
collectorHTTPClient *http.Client
nvidiaManager stats.Manager
perfManager stats.Manager
resctrlManager stats.Manager
resctrlManager resctrl.Manager
// List of raw container cgroup path prefix whitelist.
rawContainerCgroupPathPrefixWhiteList []string
// List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels.
containerEnvMetadataWhiteList []string
}
// Start the container manager.
@@ -327,7 +330,7 @@ func (m *manager) Start() error {
func (m *manager) Stop() error {
defer m.nvidiaManager.Destroy()
defer m.destroyPerfCollectors()
defer m.destroyCollectors()
// Stop and wait on all quit channels.
for i, c := range m.quitChannels {
// Send the exit signal and wait on the thread to exit (by closing the channel).
@@ -345,9 +348,10 @@ func (m *manager) Stop() error {
return nil
}
func (m *manager) destroyPerfCollectors() {
func (m *manager) destroyCollectors() {
for _, container := range m.containers {
container.perfCollector.Destroy()
container.resctrlCollector.Destroy()
}
}
@@ -590,7 +594,7 @@ func (m *manager) getAllDockerContainers() map[string]*containerData {
// Get containers in the Docker namespace.
for name, cont := range m.containers {
if name.Namespace == docker.DockerNamespace {
if name.Namespace == DockerNamespace {
containers[cont.info.Name] = cont
}
}
@@ -622,14 +626,14 @@ func (m *manager) getDockerContainer(containerName string) (*containerData, erro
// Check for the container in the Docker container namespace.
cont, ok := m.containers[namespacedContainerName{
Namespace: docker.DockerNamespace,
Namespace: DockerNamespace,
Name: containerName,
}]
// Look for container by short prefix name if no exact match found.
if !ok {
for contName, c := range m.containers {
if contName.Namespace == docker.DockerNamespace && strings.HasPrefix(contName.Name, containerName) {
if contName.Namespace == DockerNamespace && strings.HasPrefix(contName.Name, containerName) {
if cont == nil {
cont = c
} else {
@@ -692,6 +696,10 @@ func (m *manager) GetRequestedContainersInfo(containerName string, options v2.Re
for name, data := range containers {
info, err := m.containerDataToContainerInfo(data, &query)
if err != nil {
if err == memory.ErrDataNotFound {
klog.Warningf("Error getting data for container %s because of race condition", name)
continue
}
errs.append(name, "containerDataToContainerInfo", err)
}
containersMap[name] = info
@@ -908,7 +916,7 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
return nil
}
handler, accept, err := container.NewContainerHandler(containerName, watchSource, m.inHostNamespace)
handler, accept, err := container.NewContainerHandler(containerName, watchSource, m.containerEnvMetadataWhiteList, m.inHostNamespace)
if err != nil {
return err
}
@@ -928,13 +936,7 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
return err
}
if cgroups.IsCgroup2UnifiedMode() {
perfCgroupPath := path.Join(fs2.UnifiedMountpoint, containerName)
cont.perfCollector, err = m.perfManager.GetCollector(perfCgroupPath)
if err != nil {
klog.Errorf("Perf event metrics will not be available for container %q: %v", containerName, err)
}
} else {
if !cgroups.IsCgroup2UnifiedMode() {
devicesCgroupPath, err := handler.GetCgroupPath("devices")
if err != nil {
klog.Warningf("Error getting devices cgroup path: %v", err)
@@ -944,6 +946,8 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
klog.V(4).Infof("GPU metrics may be unavailable/incomplete for container %s: %s", cont.info.Name, err)
}
}
}
if m.includedMetrics.Has(container.PerfMetrics) {
perfCgroupPath, err := handler.GetCgroupPath("perf_event")
if err != nil {
klog.Warningf("Error getting perf_event cgroup path: %q", err)
@@ -956,14 +960,11 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
}
if m.includedMetrics.Has(container.ResctrlMetrics) {
resctrlPath, err := intelrdt.GetIntelRdtPath(containerName)
cont.resctrlCollector, err = m.resctrlManager.GetCollector(containerName, func() ([]string, error) {
return cont.getContainerPids(m.inHostNamespace)
}, len(m.machineInfo.Topology))
if err != nil {
klog.V(4).Infof("Error getting resctrl path: %q", err)
} else {
cont.resctrlCollector, err = m.resctrlManager.GetCollector(resctrlPath)
if err != nil {
klog.V(4).Infof("resctrl metrics will not be available for container %s: %s", cont.info.Name, err)
}
klog.V(4).Infof("resctrl metrics will not be available for container %s: %s", cont.info.Name, err)
}
}
@@ -1005,7 +1006,6 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
if err != nil {
return err
}
// Start the container's housekeeping.
return cont.Start()
}
@@ -1237,6 +1237,24 @@ func (m *manager) watchForNewOoms() error {
if err != nil {
klog.Errorf("failed to add OOM kill event for %q: %v", oomInstance.ContainerName, err)
}
// Count OOM events for later collection by prometheus
request := v2.RequestOptions{
IdType: v2.TypeName,
Count: 1,
}
conts, err := m.getRequestedContainers(oomInstance.ContainerName, request)
if err != nil {
klog.V(2).Infof("failed getting container info for %q: %v", oomInstance.ContainerName, err)
continue
}
if len(conts) != 1 {
klog.V(2).Info("Expected the request to match only one container")
continue
}
for _, cont := range conts {
atomic.AddUint64(&cont.oomEvents, 1)
}
}
}()
return nil
@@ -1304,14 +1322,6 @@ func parseEventsStoragePolicy() events.StoragePolicy {
return policy
}
func (m *manager) DockerImages() ([]info.DockerImage, error) {
return docker.Images()
}
func (m *manager) DockerInfo() (info.DockerStatus, error) {
return docker.Status()
}
func (m *manager) DebugInfo() map[string][]string {
debugInfo := container.DebugInfo()
@@ -1368,20 +1378,10 @@ func getVersionInfo() (*info.VersionInfo, error) {
kernelVersion := machine.KernelVersion()
osVersion := machine.ContainerOsVersion()
dockerVersion, err := docker.VersionString()
if err != nil {
return nil, err
}
dockerAPIVersion, err := docker.APIVersionString()
if err != nil {
return nil, err
}
return &info.VersionInfo{
KernelVersion: kernelVersion,
ContainerOsVersion: osVersion,
DockerVersion: dockerVersion,
DockerAPIVersion: dockerAPIVersion,
CadvisorVersion: version.Info["version"],
CadvisorRevision: version.Info["revision"],
}, nil