Merge pull request #114883 from bobbypage/cadvisor_v047

deps: Bump cAdvisor to v0.47.1
Kubernetes Prow Robot, committed by GitHub on 2023-01-12 09:04:54 -08:00
39 changed files with 144 additions and 7209 deletions


@@ -1,276 +0,0 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package accelerators
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/google/cadvisor/container"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/stats"
"github.com/mindprince/gonvml"
"k8s.io/klog/v2"
)

type nvidiaManager struct {
sync.Mutex
// true if there are NVIDIA devices present on the node
devicesPresent bool
// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
nvmlInitialized bool
// nvidiaDevices is a map from device minor number to a handle that can be used to get metrics about the device
nvidiaDevices map[int]gonvml.Device
}

var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"

const nvidiaVendorID = "0x10de"

func NewNvidiaManager(includedMetrics container.MetricSet) stats.Manager {
if !includedMetrics.Has(container.AcceleratorUsageMetrics) {
klog.V(2).Info("NVIDIA GPU metrics disabled")
return &stats.NoopManager{}
}
manager := &nvidiaManager{}
err := manager.setup()
if err != nil {
klog.V(2).Infof("NVIDIA setup failed: %s", err)
}
return manager
}

// setup initializes NVML if NVIDIA devices are present on the node.
func (nm *nvidiaManager) setup() error {
if !detectDevices(nvidiaVendorID) {
return fmt.Errorf("no NVIDIA devices found")
}
nm.devicesPresent = true
return initializeNVML(nm)
}

// detectDevices returns true if a device with given pci id is present on the node.
func detectDevices(vendorID string) bool {
devices, err := ioutil.ReadDir(sysFsPCIDevicesPath)
if err != nil {
klog.Warningf("Error reading %q: %v", sysFsPCIDevicesPath, err)
return false
}
for _, device := range devices {
vendorPath := filepath.Join(sysFsPCIDevicesPath, device.Name(), "vendor")
content, err := ioutil.ReadFile(vendorPath)
if err != nil {
klog.V(4).Infof("Error while reading %q: %v", vendorPath, err)
continue
}
if strings.EqualFold(strings.TrimSpace(string(content)), vendorID) {
klog.V(3).Infof("Found device with vendorID %q", vendorID)
return true
}
}
return false
}

// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
// This is defined as a variable to help in testing.
var initializeNVML = func(nm *nvidiaManager) error {
if err := gonvml.Initialize(); err != nil {
// This is under a logging level because otherwise we may cause
// log spam if the drivers/nvml is not installed on the system.
return fmt.Errorf("Could not initialize NVML: %v", err)
}
nm.nvmlInitialized = true
numDevices, err := gonvml.DeviceCount()
if err != nil {
return fmt.Errorf("GPU metrics would not be available. Failed to get the number of NVIDIA devices: %v", err)
}
if numDevices == 0 {
return nil
}
klog.V(1).Infof("NVML initialized. Number of NVIDIA devices: %v", numDevices)
nm.nvidiaDevices = make(map[int]gonvml.Device, numDevices)
for i := 0; i < int(numDevices); i++ {
device, err := gonvml.DeviceHandleByIndex(uint(i))
if err != nil {
return fmt.Errorf("Failed to get NVIDIA device handle %d: %v", i, err)
}
minorNumber, err := device.MinorNumber()
if err != nil {
return fmt.Errorf("Failed to get NVIDIA device minor number: %v", err)
}
nm.nvidiaDevices[int(minorNumber)] = device
}
return nil
}

// Destroy shuts down NVML.
func (nm *nvidiaManager) Destroy() {
if nm.nvmlInitialized {
err := gonvml.Shutdown()
if err != nil {
klog.Warningf("nvml library shutdown failed: %s", err)
}
}
}

// GetCollector returns a collector that can fetch NVIDIA gpu metrics for NVIDIA devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
nc := &nvidiaCollector{}
if !nm.devicesPresent {
return &stats.NoopCollector{}, nil
}
// Makes sure that we don't call initializeNVML() concurrently and
// that we only call initializeNVML() when it's not initialized.
nm.Lock()
if !nm.nvmlInitialized {
err := initializeNVML(nm)
if err != nil {
nm.Unlock()
return &stats.NoopCollector{}, err
}
}
nm.Unlock()
if len(nm.nvidiaDevices) == 0 {
return &stats.NoopCollector{}, nil
}
nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
if err != nil {
return &stats.NoopCollector{}, err
}
for _, minor := range nvidiaMinorNumbers {
device, ok := nm.nvidiaDevices[minor]
if !ok {
return &stats.NoopCollector{}, fmt.Errorf("NVIDIA device minor number %d not found in cached devices", minor)
}
nc.devices = append(nc.devices, device)
}
return nc, nil
}

// parseDevicesCgroup parses the devices cgroup devices.list file for the container
// and returns a list of minor numbers corresponding to NVIDIA GPU devices that the
// container is allowed to access. In cases where the container has access to all
// devices or all NVIDIA devices but the devices are not enumerated separately in
// the devices.list file, we return an empty list.
// This is defined as a variable to help in testing.
var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
// Always return a non-nil slice
nvidiaMinorNumbers := []int{}
devicesList := filepath.Join(devicesCgroupPath, "devices.list")
f, err := os.Open(devicesList)
if err != nil {
return nvidiaMinorNumbers, fmt.Errorf("error while opening devices cgroup file %q: %v", devicesList, err)
}
defer f.Close()
s := bufio.NewScanner(f)
// See https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt for the file format
for s.Scan() {
text := s.Text()
fields := strings.Fields(text)
if len(fields) != 3 {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: must contain three whitespace-separated fields", text)
}
// Split the second field to find out major:minor numbers
majorMinor := strings.Split(fields[1], ":")
if len(majorMinor) != 2 {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: second field should have one colon", text)
}
// NVIDIA graphics devices are character devices with major number 195.
// https://github.com/torvalds/linux/blob/v4.13/Documentation/admin-guide/devices.txt#L2583
if fields[0] == "c" && majorMinor[0] == "195" {
minorNumber, err := strconv.Atoi(majorMinor[1])
if err != nil {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: minor number is not integer", text)
}
// We don't want devices like nvidiactl (195:255) and nvidia-modeset (195:254)
if minorNumber < 128 {
nvidiaMinorNumbers = append(nvidiaMinorNumbers, minorNumber)
}
// We are ignoring the "195:*" case
// where the container has access to all NVIDIA devices on the machine.
}
// We are ignoring the "*:*" case
// where the container has access to all devices on the machine.
}
return nvidiaMinorNumbers, nil
}

type nvidiaCollector struct {
// Exposed for testing
devices []gonvml.Device
stats.NoopDestroy
}

func NewNvidiaCollector(devices []gonvml.Device) stats.Collector {
return &nvidiaCollector{devices: devices}
}

// UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
func (nc *nvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
for _, device := range nc.devices {
model, err := device.Name()
if err != nil {
return fmt.Errorf("error while getting gpu name: %v", err)
}
uuid, err := device.UUID()
if err != nil {
return fmt.Errorf("error while getting gpu uuid: %v", err)
}
memoryTotal, memoryUsed, err := device.MemoryInfo()
if err != nil {
return fmt.Errorf("error while getting gpu memory info: %v", err)
}
//TODO: Use housekeepingInterval
utilizationGPU, err := device.AverageGPUUtilization(10 * time.Second)
if err != nil {
return fmt.Errorf("error while getting gpu utilization: %v", err)
}
stats.Accelerators = append(stats.Accelerators, info.AcceleratorStats{
Make: "nvidia",
Model: model,
ID: uuid,
MemoryTotal: memoryTotal,
MemoryUsed: memoryUsed,
DutyCycle: uint64(utilizationGPU),
})
}
return nil
}
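Note: the deleted collector above attributed GPUs to containers by parsing the cgroup v1 devices.list file. A minimal, self-contained sketch of that parsing logic, run against an in-memory sample rather than a real cgroup path (the sample entries below are hypothetical):

package main

import (
    "bufio"
    "fmt"
    "strconv"
    "strings"
)

func main() {
    // Hypothetical devices.list content: two GPUs (195:0, 195:1), the
    // nvidiactl control device (195:255), and an unrelated block device.
    sample := "c 195:0 rw\nc 195:1 rw\nc 195:255 rw\nb 8:* rwm\n"

    var minors []int
    s := bufio.NewScanner(strings.NewReader(sample))
    for s.Scan() {
        fields := strings.Fields(s.Text())
        if len(fields) != 3 {
            continue
        }
        majorMinor := strings.Split(fields[1], ":")
        // NVIDIA graphics devices are character devices with major number 195;
        // minors >= 128 (nvidiactl, nvidia-modeset) are skipped, as in the code above.
        if fields[0] == "c" && len(majorMinor) == 2 && majorMinor[0] == "195" {
            if minor, err := strconv.Atoi(majorMinor[1]); err == nil && minor < 128 {
                minors = append(minors, minor)
            }
        }
    }
    fmt.Println(minors) // [0 1]
}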


@@ -172,7 +172,7 @@ func getSpecInternal(cgroupPaths map[string]string, machineInfoFactory info.Mach
if cgroup2UnifiedMode {
if utils.FileExists(path.Join(memoryRoot, "memory.max")) {
spec.HasMemory = true
spec.Memory.Reservation = readUInt64(memoryRoot, "memory.high")
spec.Memory.Reservation = readUInt64(memoryRoot, "memory.min")
spec.Memory.Limit = readUInt64(memoryRoot, "memory.max")
spec.Memory.SwapLimit = readUInt64(memoryRoot, "memory.swap.max")
}
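Context for the one-line change above: under cgroup v2, memory.high is a throttling threshold, while memory.min is the hard memory protection value and the closer analogue of the v1 reservation, so Reservation is now read from memory.min. A minimal sketch in the spirit of readUInt64 (the actual cAdvisor helper may differ; readCgroupUint64 is a hypothetical name), reading a single integer from a cgroup v2 interface file and mapping a missing file or the literal "max" to 0:

package cgroupv2

import (
    "os"
    "path/filepath"
    "strconv"
    "strings"
)

// readCgroupUint64 returns the integer stored in a cgroup v2 interface file
// such as memory.min or memory.max; a missing file or the value "max" yields 0.
func readCgroupUint64(root, file string) uint64 {
    data, err := os.ReadFile(filepath.Join(root, file))
    if err != nil {
        return 0
    }
    v, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
    if err != nil {
        return 0
    }
    return v
}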


@@ -57,7 +57,6 @@ const (
NetworkTcpUsageMetrics MetricKind = "tcp"
NetworkAdvancedTcpUsageMetrics MetricKind = "advtcp"
NetworkUdpUsageMetrics MetricKind = "udp"
AcceleratorUsageMetrics MetricKind = "accelerator"
AppMetrics MetricKind = "app"
ProcessMetrics MetricKind = "process"
HugetlbUsageMetrics MetricKind = "hugetlb"
@@ -78,7 +77,6 @@ var AllMetrics = MetricSet{
MemoryNumaMetrics: struct{}{},
CpuLoadMetrics: struct{}{},
DiskIOMetrics: struct{}{},
AcceleratorUsageMetrics: struct{}{},
DiskUsageMetrics: struct{}{},
NetworkUsageMetrics: struct{}{},
NetworkTcpUsageMetrics: struct{}{},


@@ -197,6 +197,9 @@ type MachineInfo struct {
// The amount of memory (in bytes) in this machine
MemoryCapacity uint64 `json:"memory_capacity"`
// The amount of swap (in bytes) in this machine
SwapCapacity uint64 `json:"swap_capacity"`
// Memory capacity and number of DIMMs by memory type
MemoryByType map[string]*MemoryInfo `json:"memory_by_type"`


@@ -79,6 +79,11 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
return nil, err
}
swapCapacity, err := GetMachineSwapCapacity()
if err != nil {
return nil, err
}
nvmInfo, err := nvm.GetInfo()
if err != nil {
return nil, err
@@ -128,6 +133,7 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
CpuFrequency: clockSpeed,
MemoryCapacity: memoryCapacity,
MemoryByType: memoryByType,
SwapCapacity: swapCapacity,
NVMInfo: nvmInfo,
HugePages: hugePagesInfo,
DiskMap: diskMap,
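The new SwapCapacity field is filled from the host's total swap. As an illustration only (the actual GetMachineSwapCapacity implementation may differ), total swap in bytes can be derived from the SwapTotal line of /proc/meminfo:

package machine

import (
    "bufio"
    "fmt"
    "os"
    "strconv"
    "strings"
)

// swapCapacityBytes is an illustrative helper that reads SwapTotal from
// /proc/meminfo and converts it from kB to bytes.
func swapCapacityBytes() (uint64, error) {
    f, err := os.Open("/proc/meminfo")
    if err != nil {
        return 0, err
    }
    defer f.Close()
    s := bufio.NewScanner(f)
    for s.Scan() {
        // Line format: "SwapTotal:       2097148 kB"
        fields := strings.Fields(s.Text())
        if len(fields) >= 2 && fields[0] == "SwapTotal:" {
            kb, err := strconv.ParseUint(fields[1], 10, 64)
            if err != nil {
                return 0, err
            }
            return kb * 1024, nil
        }
    }
    return 0, fmt.Errorf("SwapTotal not found in /proc/meminfo")
}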


@@ -40,6 +40,7 @@ import (
"github.com/google/cadvisor/utils/cpuload"
"github.com/docker/go-units"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
)
@@ -96,9 +97,6 @@ type containerData struct {
// Runs custom metric collectors.
collectorManager collector.CollectorManager
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
nvidiaCollector stats.Collector
// perfCollector updates stats for perf_event cgroup controller.
perfCollector stats.Collector
@@ -448,7 +446,6 @@ func newContainerData(containerName string, memoryCache *memory.InMemoryCache, h
onDemandChan: make(chan chan struct{}, 100),
clock: clock,
perfCollector: &stats.NoopCollector{},
nvidiaCollector: &stats.NoopCollector{},
resctrlCollector: &stats.NoopCollector{},
}
cont.info.ContainerReference = ref
@@ -688,12 +685,6 @@ func (cd *containerData) updateStats() error {
}
}
var nvidiaStatsErr error
if cd.nvidiaCollector != nil {
// This updates the Accelerators field of the stats struct
nvidiaStatsErr = cd.nvidiaCollector.UpdateStats(stats)
}
perfStatsErr := cd.perfCollector.UpdateStats(stats)
resctrlStatsErr := cd.resctrlCollector.UpdateStats(stats)
@@ -718,10 +709,6 @@ func (cd *containerData) updateStats() error {
if statsErr != nil {
return statsErr
}
if nvidiaStatsErr != nil {
klog.Errorf("error occurred while collecting nvidia stats for container %s: %s", cInfo.Name, err)
return nvidiaStatsErr
}
if perfStatsErr != nil {
klog.Errorf("error occurred while collecting perf stats for container %s: %s", cInfo.Name, err)
return perfStatsErr


@@ -27,7 +27,6 @@ import (
"sync/atomic"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
@@ -199,7 +198,6 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHTTPClient: collectorHTTPClient,
nvidiaManager: accelerators.NewNvidiaManager(includedMetricsSet),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
containerEnvMetadataWhiteList: containerEnvMetadataWhiteList,
}
@@ -259,7 +257,6 @@ type manager struct {
containerWatchers []watcher.ContainerWatcher
eventsChannel chan watcher.ContainerEvent
collectorHTTPClient *http.Client
nvidiaManager stats.Manager
perfManager stats.Manager
resctrlManager resctrl.Manager
// List of raw container cgroup path prefix whitelist.
@@ -327,7 +324,6 @@ func (m *manager) Start() error {
}
func (m *manager) Stop() error {
defer m.nvidiaManager.Destroy()
defer m.destroyCollectors()
// Stop and wait on all quit channels.
for i, c := range m.quitChannels {
@@ -934,17 +930,6 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
return err
}
if !cgroups.IsCgroup2UnifiedMode() {
devicesCgroupPath, err := handler.GetCgroupPath("devices")
if err != nil {
klog.Warningf("Error getting devices cgroup path: %v", err)
} else {
cont.nvidiaCollector, err = m.nvidiaManager.GetCollector(devicesCgroupPath)
if err != nil {
klog.V(4).Infof("GPU metrics may be unavailable/incomplete for container %s: %s", cont.info.Name, err)
}
}
}
if m.includedMetrics.Has(container.PerfMetrics) {
perfCgroupPath, err := handler.GetCgroupPath("perf_event")
if err != nil {
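The block removed above only ran outside cgroup v2 unified mode because GPU-to-container attribution depended on the v1 devices controller's devices.list file, which has no cgroup v2 counterpart; with the accelerator code gone, that gate disappears as well. For reference, a minimal sketch of detecting the unified hierarchy without the runc helper used above, by checking the filesystem magic of /sys/fs/cgroup (illustrative only):

package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

// isCgroup2UnifiedMode reports whether /sys/fs/cgroup is mounted as cgroup2.
func isCgroup2UnifiedMode() (bool, error) {
    var st unix.Statfs_t
    if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
        return false, err
    }
    return st.Type == unix.CGROUP2_SUPER_MAGIC, nil
}

func main() {
    unified, err := isCgroup2UnifiedMode()
    fmt.Println(unified, err)
}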


@@ -25,6 +25,7 @@ import (
v2 "github.com/google/cadvisor/info/v2"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
)
@@ -492,59 +493,6 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
},
}...)
}
if includedMetrics.Has(container.AcceleratorUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_accelerator_memory_total_bytes",
help: "Total accelerator memory.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryTotal),
labels: []string{value.Make, value.Model, value.ID},
timestamp: s.Timestamp,
})
}
return values
},
}, {
name: "container_accelerator_memory_used_bytes",
help: "Total accelerator memory allocated.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryUsed),
labels: []string{value.Make, value.Model, value.ID},
timestamp: s.Timestamp,
})
}
return values
},
}, {
name: "container_accelerator_duty_cycle",
help: "Percent of time over the past sample period during which the accelerator was actively processing.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.DutyCycle),
labels: []string{value.Make, value.Model, value.ID},
timestamp: s.Timestamp,
})
}
return values
},
},
}...)
}
if includedMetrics.Has(container.DiskUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
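The container_accelerator_memory_total_bytes, container_accelerator_memory_used_bytes, and container_accelerator_duty_cycle series removed above disappear along with the NVML-based collector; deployments that still need per-GPU metrics typically obtain them from a dedicated exporter. As a standalone illustration only (not cAdvisor code; the label values below are hypothetical), an equivalent duty-cycle gauge exposed with the Prometheus client library:

package main

import (
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    // Gauge mirroring the removed cAdvisor metric name and labels.
    dutyCycle := prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "container_accelerator_duty_cycle",
        Help: "Percent of time over the past sample period during which the accelerator was actively processing.",
    }, []string{"make", "model", "acc_id"})
    prometheus.MustRegister(dutyCycle)

    // Hypothetical sample value; a real exporter would poll the device.
    dutyCycle.WithLabelValues("nvidia", "Tesla T4", "GPU-00000000").Set(42)

    http.Handle("/metrics", promhttp.Handler())
    _ = http.ListenAndServe(":9100", nil)
}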