Merge pull request #114883 from bobbypage/cadvisor_v047

deps: Bump cAdvisor to v0.47.1
Kubernetes Prow Robot, committed by GitHub on 2023-01-12 09:04:54 -08:00
39 changed files with 144 additions and 7209 deletions


@@ -1,276 +0,0 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package accelerators
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/google/cadvisor/container"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/stats"
"github.com/mindprince/gonvml"
"k8s.io/klog/v2"
)

type nvidiaManager struct {
sync.Mutex
// true if there are NVIDIA devices present on the node
devicesPresent bool
// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
nvmlInitialized bool
// nvidiaDevices is a map from device minor number to a handle that can be used to get metrics about the device
nvidiaDevices map[int]gonvml.Device
}

var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"

const nvidiaVendorID = "0x10de"

func NewNvidiaManager(includedMetrics container.MetricSet) stats.Manager {
if !includedMetrics.Has(container.AcceleratorUsageMetrics) {
klog.V(2).Info("NVIDIA GPU metrics disabled")
return &stats.NoopManager{}
}
manager := &nvidiaManager{}
err := manager.setup()
if err != nil {
klog.V(2).Infof("NVIDIA setup failed: %s", err)
}
return manager
}

// setup initializes NVML if NVIDIA devices are present on the node.
func (nm *nvidiaManager) setup() error {
if !detectDevices(nvidiaVendorID) {
return fmt.Errorf("no NVIDIA devices found")
}
nm.devicesPresent = true
return initializeNVML(nm)
}

// detectDevices returns true if a device with given pci id is present on the node.
func detectDevices(vendorID string) bool {
devices, err := ioutil.ReadDir(sysFsPCIDevicesPath)
if err != nil {
klog.Warningf("Error reading %q: %v", sysFsPCIDevicesPath, err)
return false
}
for _, device := range devices {
vendorPath := filepath.Join(sysFsPCIDevicesPath, device.Name(), "vendor")
content, err := ioutil.ReadFile(vendorPath)
if err != nil {
klog.V(4).Infof("Error while reading %q: %v", vendorPath, err)
continue
}
if strings.EqualFold(strings.TrimSpace(string(content)), vendorID) {
klog.V(3).Infof("Found device with vendorID %q", vendorID)
return true
}
}
return false
}

// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
// This is defined as a variable to help in testing.
var initializeNVML = func(nm *nvidiaManager) error {
if err := gonvml.Initialize(); err != nil {
// This is under a logging level because otherwise we may cause
// log spam if the drivers/nvml is not installed on the system.
return fmt.Errorf("Could not initialize NVML: %v", err)
}
nm.nvmlInitialized = true
numDevices, err := gonvml.DeviceCount()
if err != nil {
return fmt.Errorf("GPU metrics would not be available. Failed to get the number of NVIDIA devices: %v", err)
}
if numDevices == 0 {
return nil
}
klog.V(1).Infof("NVML initialized. Number of NVIDIA devices: %v", numDevices)
nm.nvidiaDevices = make(map[int]gonvml.Device, numDevices)
for i := 0; i < int(numDevices); i++ {
device, err := gonvml.DeviceHandleByIndex(uint(i))
if err != nil {
return fmt.Errorf("Failed to get NVIDIA device handle %d: %v", i, err)
}
minorNumber, err := device.MinorNumber()
if err != nil {
return fmt.Errorf("Failed to get NVIDIA device minor number: %v", err)
}
nm.nvidiaDevices[int(minorNumber)] = device
}
return nil
}

// Destroy shuts down NVML.
func (nm *nvidiaManager) Destroy() {
if nm.nvmlInitialized {
err := gonvml.Shutdown()
if err != nil {
klog.Warningf("nvml library shutdown failed: %s", err)
}
}
}

// GetCollector returns a collector that can fetch NVIDIA gpu metrics for NVIDIA devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
nc := &nvidiaCollector{}
if !nm.devicesPresent {
return &stats.NoopCollector{}, nil
}
// Makes sure that we don't call initializeNVML() concurrently and
// that we only call initializeNVML() when it's not initialized.
nm.Lock()
if !nm.nvmlInitialized {
err := initializeNVML(nm)
if err != nil {
nm.Unlock()
return &stats.NoopCollector{}, err
}
}
nm.Unlock()
if len(nm.nvidiaDevices) == 0 {
return &stats.NoopCollector{}, nil
}
nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
if err != nil {
return &stats.NoopCollector{}, err
}
for _, minor := range nvidiaMinorNumbers {
device, ok := nm.nvidiaDevices[minor]
if !ok {
return &stats.NoopCollector{}, fmt.Errorf("NVIDIA device minor number %d not found in cached devices", minor)
}
nc.devices = append(nc.devices, device)
}
return nc, nil
}

// parseDevicesCgroup parses the devices cgroup devices.list file for the container
// and returns a list of minor numbers corresponding to NVIDIA GPU devices that the
// container is allowed to access. In cases where the container has access to all
// devices or all NVIDIA devices but the devices are not enumerated separately in
// the devices.list file, we return an empty list.
// This is defined as a variable to help in testing.
var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
// Always return a non-nil slice
nvidiaMinorNumbers := []int{}
devicesList := filepath.Join(devicesCgroupPath, "devices.list")
f, err := os.Open(devicesList)
if err != nil {
return nvidiaMinorNumbers, fmt.Errorf("error while opening devices cgroup file %q: %v", devicesList, err)
}
defer f.Close()
s := bufio.NewScanner(f)
// See https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt for the file format
for s.Scan() {
text := s.Text()
fields := strings.Fields(text)
if len(fields) != 3 {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: must contain three whitespace-separated fields", text)
}
// Split the second field to find out major:minor numbers
majorMinor := strings.Split(fields[1], ":")
if len(majorMinor) != 2 {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: second field should have one colon", text)
}
// NVIDIA graphics devices are character devices with major number 195.
// https://github.com/torvalds/linux/blob/v4.13/Documentation/admin-guide/devices.txt#L2583
if fields[0] == "c" && majorMinor[0] == "195" {
minorNumber, err := strconv.Atoi(majorMinor[1])
if err != nil {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: minor number is not integer", text)
}
// We don't want devices like nvidiactl (195:255) and nvidia-modeset (195:254)
if minorNumber < 128 {
nvidiaMinorNumbers = append(nvidiaMinorNumbers, minorNumber)
}
// We are ignoring the "195:*" case
// where the container has access to all NVIDIA devices on the machine.
}
// We are ignoring the "*:*" case
// where the container has access to all devices on the machine.
}
return nvidiaMinorNumbers, nil
}

type nvidiaCollector struct {
// Exposed for testing
devices []gonvml.Device
stats.NoopDestroy
}

func NewNvidiaCollector(devices []gonvml.Device) stats.Collector {
return &nvidiaCollector{devices: devices}
}

// UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
func (nc *nvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
for _, device := range nc.devices {
model, err := device.Name()
if err != nil {
return fmt.Errorf("error while getting gpu name: %v", err)
}
uuid, err := device.UUID()
if err != nil {
return fmt.Errorf("error while getting gpu uuid: %v", err)
}
memoryTotal, memoryUsed, err := device.MemoryInfo()
if err != nil {
return fmt.Errorf("error while getting gpu memory info: %v", err)
}
//TODO: Use housekeepingInterval
utilizationGPU, err := device.AverageGPUUtilization(10 * time.Second)
if err != nil {
return fmt.Errorf("error while getting gpu utilization: %v", err)
}
stats.Accelerators = append(stats.Accelerators, info.AcceleratorStats{
Make: "nvidia",
Model: model,
ID: uuid,
MemoryTotal: memoryTotal,
MemoryUsed: memoryUsed,
DutyCycle: uint64(utilizationGPU),
})
}
return nil
}
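Note: the deleted collector above attributed GPUs to containers by parsing the cgroup v1 devices.list file. A minimal, self-contained sketch of that parsing logic, run against an in-memory sample rather than a real cgroup path (the sample entries below are hypothetical):

package main

import (
    "bufio"
    "fmt"
    "strconv"
    "strings"
)

func main() {
    // Hypothetical devices.list content: two GPUs (195:0, 195:1), the
    // nvidiactl control device (195:255), and an unrelated block device.
    sample := "c 195:0 rw\nc 195:1 rw\nc 195:255 rw\nb 8:* rwm\n"

    var minors []int
    s := bufio.NewScanner(strings.NewReader(sample))
    for s.Scan() {
        fields := strings.Fields(s.Text())
        if len(fields) != 3 {
            continue
        }
        majorMinor := strings.Split(fields[1], ":")
        // NVIDIA graphics devices are character devices with major number 195;
        // minors >= 128 (nvidiactl, nvidia-modeset) are skipped, as in the code above.
        if fields[0] == "c" && len(majorMinor) == 2 && majorMinor[0] == "195" {
            if minor, err := strconv.Atoi(majorMinor[1]); err == nil && minor < 128 {
                minors = append(minors, minor)
            }
        }
    }
    fmt.Println(minors) // [0 1]
}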


@@ -172,7 +172,7 @@ func getSpecInternal(cgroupPaths map[string]string, machineInfoFactory info.Mach
if cgroup2UnifiedMode {
if utils.FileExists(path.Join(memoryRoot, "memory.max")) {
spec.HasMemory = true
spec.Memory.Reservation = readUInt64(memoryRoot, "memory.high")
spec.Memory.Reservation = readUInt64(memoryRoot, "memory.min")
spec.Memory.Limit = readUInt64(memoryRoot, "memory.max")
spec.Memory.SwapLimit = readUInt64(memoryRoot, "memory.swap.max")
}
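Context for the one-line change above: under cgroup v2, memory.high is a throttling threshold, while memory.min is the hard memory protection value and the closer analogue of the v1 reservation, so Reservation is now read from memory.min. A minimal sketch in the spirit of readUInt64 (the actual cAdvisor helper may differ; readCgroupUint64 is a hypothetical name), reading a single integer from a cgroup v2 interface file and mapping a missing file or the literal "max" to 0:

package cgroupv2

import (
    "os"
    "path/filepath"
    "strconv"
    "strings"
)

// readCgroupUint64 returns the integer stored in a cgroup v2 interface file
// such as memory.min or memory.max; a missing file or the value "max" yields 0.
func readCgroupUint64(root, file string) uint64 {
    data, err := os.ReadFile(filepath.Join(root, file))
    if err != nil {
        return 0
    }
    v, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
    if err != nil {
        return 0
    }
    return v
}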


@@ -57,7 +57,6 @@ const (
NetworkTcpUsageMetrics MetricKind = "tcp"
NetworkAdvancedTcpUsageMetrics MetricKind = "advtcp"
NetworkUdpUsageMetrics MetricKind = "udp"
AcceleratorUsageMetrics MetricKind = "accelerator"
AppMetrics MetricKind = "app"
ProcessMetrics MetricKind = "process"
HugetlbUsageMetrics MetricKind = "hugetlb"
@@ -78,7 +77,6 @@ var AllMetrics = MetricSet{
MemoryNumaMetrics: struct{}{},
CpuLoadMetrics: struct{}{},
DiskIOMetrics: struct{}{},
AcceleratorUsageMetrics: struct{}{},
DiskUsageMetrics: struct{}{},
NetworkUsageMetrics: struct{}{},
NetworkTcpUsageMetrics: struct{}{},


@@ -197,6 +197,9 @@ type MachineInfo struct {
// The amount of memory (in bytes) in this machine
MemoryCapacity uint64 `json:"memory_capacity"`
// The amount of swap (in bytes) in this machine
SwapCapacity uint64 `json:"swap_capacity"`
// Memory capacity and number of DIMMs by memory type
MemoryByType map[string]*MemoryInfo `json:"memory_by_type"`


@@ -79,6 +79,11 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
return nil, err
}
swapCapacity, err := GetMachineSwapCapacity()
if err != nil {
return nil, err
}
nvmInfo, err := nvm.GetInfo()
if err != nil {
return nil, err
@@ -128,6 +133,7 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
CpuFrequency: clockSpeed,
MemoryCapacity: memoryCapacity,
MemoryByType: memoryByType,
SwapCapacity: swapCapacity,
NVMInfo: nvmInfo,
HugePages: hugePagesInfo,
DiskMap: diskMap,
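The new SwapCapacity field is filled from the host's total swap. As an illustration only (the actual GetMachineSwapCapacity implementation may differ), total swap in bytes can be derived from the SwapTotal line of /proc/meminfo:

package machine

import (
    "bufio"
    "fmt"
    "os"
    "strconv"
    "strings"
)

// swapCapacityBytes is an illustrative helper that reads SwapTotal from
// /proc/meminfo and converts it from kB to bytes.
func swapCapacityBytes() (uint64, error) {
    f, err := os.Open("/proc/meminfo")
    if err != nil {
        return 0, err
    }
    defer f.Close()
    s := bufio.NewScanner(f)
    for s.Scan() {
        // Line format: "SwapTotal:       2097148 kB"
        fields := strings.Fields(s.Text())
        if len(fields) >= 2 && fields[0] == "SwapTotal:" {
            kb, err := strconv.ParseUint(fields[1], 10, 64)
            if err != nil {
                return 0, err
            }
            return kb * 1024, nil
        }
    }
    return 0, fmt.Errorf("SwapTotal not found in /proc/meminfo")
}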


@@ -40,6 +40,7 @@ import (
"github.com/google/cadvisor/utils/cpuload"
"github.com/docker/go-units"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
)
@@ -96,9 +97,6 @@ type containerData struct {
// Runs custom metric collectors.
collectorManager collector.CollectorManager
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
nvidiaCollector stats.Collector
// perfCollector updates stats for perf_event cgroup controller.
perfCollector stats.Collector
@@ -448,7 +446,6 @@ func newContainerData(containerName string, memoryCache *memory.InMemoryCache, h
onDemandChan: make(chan chan struct{}, 100),
clock: clock,
perfCollector: &stats.NoopCollector{},
nvidiaCollector: &stats.NoopCollector{},
resctrlCollector: &stats.NoopCollector{},
}
cont.info.ContainerReference = ref
@@ -688,12 +685,6 @@ func (cd *containerData) updateStats() error {
}
}
var nvidiaStatsErr error
if cd.nvidiaCollector != nil {
// This updates the Accelerators field of the stats struct
nvidiaStatsErr = cd.nvidiaCollector.UpdateStats(stats)
}
perfStatsErr := cd.perfCollector.UpdateStats(stats)
resctrlStatsErr := cd.resctrlCollector.UpdateStats(stats)
@@ -718,10 +709,6 @@ func (cd *containerData) updateStats() error {
if statsErr != nil {
return statsErr
}
if nvidiaStatsErr != nil {
klog.Errorf("error occurred while collecting nvidia stats for container %s: %s", cInfo.Name, err)
return nvidiaStatsErr
}
if perfStatsErr != nil {
klog.Errorf("error occurred while collecting perf stats for container %s: %s", cInfo.Name, err)
return perfStatsErr


@@ -27,7 +27,6 @@ import (
"sync/atomic"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
@@ -199,7 +198,6 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHTTPClient: collectorHTTPClient,
nvidiaManager: accelerators.NewNvidiaManager(includedMetricsSet),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
containerEnvMetadataWhiteList: containerEnvMetadataWhiteList,
}
@@ -259,7 +257,6 @@ type manager struct {
containerWatchers []watcher.ContainerWatcher
eventsChannel chan watcher.ContainerEvent
collectorHTTPClient *http.Client
nvidiaManager stats.Manager
perfManager stats.Manager
resctrlManager resctrl.Manager
// List of raw container cgroup path prefix whitelist.
@@ -327,7 +324,6 @@ func (m *manager) Start() error {
}
func (m *manager) Stop() error {
defer m.nvidiaManager.Destroy()
defer m.destroyCollectors()
// Stop and wait on all quit channels.
for i, c := range m.quitChannels {
@@ -934,17 +930,6 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
return err
}
if !cgroups.IsCgroup2UnifiedMode() {
devicesCgroupPath, err := handler.GetCgroupPath("devices")
if err != nil {
klog.Warningf("Error getting devices cgroup path: %v", err)
} else {
cont.nvidiaCollector, err = m.nvidiaManager.GetCollector(devicesCgroupPath)
if err != nil {
klog.V(4).Infof("GPU metrics may be unavailable/incomplete for container %s: %s", cont.info.Name, err)
}
}
}
if m.includedMetrics.Has(container.PerfMetrics) {
perfCgroupPath, err := handler.GetCgroupPath("perf_event")
if err != nil {
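The block removed above only ran outside cgroup v2 unified mode because GPU-to-container attribution depended on the v1 devices controller's devices.list file, which has no cgroup v2 counterpart; with the accelerator code gone, that gate disappears as well. For reference, a minimal sketch of detecting the unified hierarchy without the runc helper used above, by checking the filesystem magic of /sys/fs/cgroup (illustrative only):

package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

// isCgroup2UnifiedMode reports whether /sys/fs/cgroup is mounted as cgroup2.
func isCgroup2UnifiedMode() (bool, error) {
    var st unix.Statfs_t
    if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
        return false, err
    }
    return st.Type == unix.CGROUP2_SUPER_MAGIC, nil
}

func main() {
    unified, err := isCgroup2UnifiedMode()
    fmt.Println(unified, err)
}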


@@ -25,6 +25,7 @@ import (
v2 "github.com/google/cadvisor/info/v2"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
)
@@ -492,59 +493,6 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
},
}...)
}
if includedMetrics.Has(container.AcceleratorUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_accelerator_memory_total_bytes",
help: "Total accelerator memory.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryTotal),
labels: []string{value.Make, value.Model, value.ID},
timestamp: s.Timestamp,
})
}
return values
},
}, {
name: "container_accelerator_memory_used_bytes",
help: "Total accelerator memory allocated.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryUsed),
labels: []string{value.Make, value.Model, value.ID},
timestamp: s.Timestamp,
})
}
return values
},
}, {
name: "container_accelerator_duty_cycle",
help: "Percent of time over the past sample period during which the accelerator was actively processing.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.DutyCycle),
labels: []string{value.Make, value.Model, value.ID},
timestamp: s.Timestamp,
})
}
return values
},
},
}...)
}
if includedMetrics.Has(container.DiskUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
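The container_accelerator_memory_total_bytes, container_accelerator_memory_used_bytes, and container_accelerator_duty_cycle series removed above disappear along with the NVML-based collector; deployments that still need per-GPU metrics typically obtain them from a dedicated exporter. As a standalone illustration only (not cAdvisor code; the label values below are hypothetical), an equivalent duty-cycle gauge exposed with the Prometheus client library:

package main

import (
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    // Gauge mirroring the removed cAdvisor metric name and labels.
    dutyCycle := prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "container_accelerator_duty_cycle",
        Help: "Percent of time over the past sample period during which the accelerator was actively processing.",
    }, []string{"make", "model", "acc_id"})
    prometheus.MustRegister(dutyCycle)

    // Hypothetical sample value; a real exporter would poll the device.
    dutyCycle.WithLabelValues("nvidia", "Tesla T4", "GPU-00000000").Set(42)

    http.Handle("/metrics", promhttp.Handler())
    _ = http.ListenAndServe(":9100", nil)
}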