Update cAdvisor.

Also update golang.org/x/sys because of google/cadvisor#1786
Rohit Agarwal
2017-11-06 13:54:48 -08:00
parent dad41f8526
commit fe5ef1b494
297 changed files with 10024 additions and 8974 deletions

vendor/github.com/google/cadvisor/accelerators/BUILD

@@ -0,0 +1,30 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = [
"nvidia.go",
"types.go",
],
importpath = "github.com/google/cadvisor/accelerators",
visibility = ["//visibility:public"],
deps = [
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/mindprince/gonvml:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)


@@ -0,0 +1,239 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package accelerators
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"time"
info "github.com/google/cadvisor/info/v1"
"github.com/golang/glog"
"github.com/mindprince/gonvml"
)
type NvidiaManager struct {
// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
nvmlInitialized bool
// nvidiaDevices is a map from device minor number to a handle that can be used to get metrics about the device
nvidiaDevices map[int]gonvml.Device
}
var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
const nvidiaVendorId = "0x10de"
// Setup initializes NVML if nvidia devices are present on the node.
func (nm *NvidiaManager) Setup() {
if !detectDevices(nvidiaVendorId) {
glog.Info("No NVIDIA devices found.")
return
}
go func() {
glog.Info("Starting goroutine to initialize NVML")
nm.initializeNVML()
if nm.nvmlInitialized {
return
}
// TODO: use globalHousekeepingInterval
for range time.Tick(time.Minute) {
nm.initializeNVML()
if nm.nvmlInitialized {
return
}
}
}()
}
// detectDevices returns true if a device with given pci id is present on the node.
func detectDevices(vendorId string) bool {
devices, err := ioutil.ReadDir(sysFsPCIDevicesPath)
if err != nil {
glog.Warningf("error reading %q: %v", sysFsPCIDevicesPath, err)
return false
}
for _, device := range devices {
vendorPath := filepath.Join(sysFsPCIDevicesPath, device.Name(), "vendor")
content, err := ioutil.ReadFile(vendorPath)
if err != nil {
glog.Infof("Error while reading %q: %v", vendorPath, err)
continue
}
if strings.EqualFold(strings.TrimSpace(string(content)), vendorId) {
glog.Infof("Found device with vendorId %q", vendorId)
return true
}
}
return false
}
// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
func (nm *NvidiaManager) initializeNVML() {
if err := gonvml.Initialize(); err != nil {
// This is under a logging level because otherwise we may cause
// log spam if the drivers/nvml is not installed on the system.
glog.V(3).Infof("Could not initialize NVML: %v", err)
return
}
nm.nvmlInitialized = true
numDevices, err := gonvml.DeviceCount()
if err != nil {
glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
return
}
glog.Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
nm.nvidiaDevices = make(map[int]gonvml.Device, numDevices)
for i := 0; i < int(numDevices); i++ {
device, err := gonvml.DeviceHandleByIndex(uint(i))
if err != nil {
glog.Warningf("Failed to get nvidia device handle %d: %v", i, err)
continue
}
minorNumber, err := device.MinorNumber()
if err != nil {
glog.Warningf("Failed to get nvidia device minor number: %v", err)
continue
}
nm.nvidiaDevices[int(minorNumber)] = device
}
}
// Destroy shuts down NVML.
func (nm *NvidiaManager) Destroy() {
if nm.nvmlInitialized {
gonvml.Shutdown()
}
}
// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
nc := &NvidiaCollector{}
if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
return nc, nil
}
nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
if err != nil {
return nc, err
}
for _, minor := range nvidiaMinorNumbers {
device, ok := nm.nvidiaDevices[minor]
if !ok {
return nc, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor)
}
nc.Devices = append(nc.Devices, device)
}
return nc, nil
}
// parseDevicesCgroup parses the devices cgroup devices.list file for the container
// and returns a list of minor numbers corresponding to NVIDIA GPU devices that the
// container is allowed to access. In cases where the container has access to all
// devices or all NVIDIA devices but the devices are not enumerated separately in
// the devices.list file, we return an empty list.
// This is defined as a variable to help in testing.
var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
// Always return a non-nil slice
nvidiaMinorNumbers := []int{}
devicesList := filepath.Join(devicesCgroupPath, "devices.list")
f, err := os.Open(devicesList)
if err != nil {
return nvidiaMinorNumbers, fmt.Errorf("error while opening devices cgroup file %q: %v", devicesList, err)
}
defer f.Close()
s := bufio.NewScanner(f)
// See https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt for the file format
for s.Scan() {
text := s.Text()
fields := strings.Fields(text)
if len(fields) != 3 {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: must contain three whitespace-separated fields", text)
}
// Split the second field to find out major:minor numbers
majorMinor := strings.Split(fields[1], ":")
if len(majorMinor) != 2 {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: second field should have one colon", text)
}
// NVIDIA graphics devices are character devices with major number 195.
// https://github.com/torvalds/linux/blob/v4.13/Documentation/admin-guide/devices.txt#L2583
if fields[0] == "c" && majorMinor[0] == "195" {
minorNumber, err := strconv.Atoi(majorMinor[1])
if err != nil {
return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: minor number is not integer", text)
}
// We don't want devices like nvidiactl (195:255) and nvidia-modeset (195:254)
if minorNumber < 128 {
nvidiaMinorNumbers = append(nvidiaMinorNumbers, minorNumber)
}
// We are ignoring the "195:*" case
// where the container has access to all NVIDIA devices on the machine.
}
// We are ignoring the "*:*" case
// where the container has access to all devices on the machine.
}
return nvidiaMinorNumbers, nil
}
type NvidiaCollector struct {
// Exposed for testing
Devices []gonvml.Device
}
// UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
func (nc *NvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
for _, device := range nc.Devices {
model, err := device.Name()
if err != nil {
return fmt.Errorf("error while getting gpu name: %v", err)
}
uuid, err := device.UUID()
if err != nil {
return fmt.Errorf("error while getting gpu uuid: %v", err)
}
memoryTotal, memoryUsed, err := device.MemoryInfo()
if err != nil {
return fmt.Errorf("error while getting gpu memory info: %v", err)
}
//TODO: Use housekeepingInterval
utilizationGPU, err := device.AverageGPUUtilization(10 * time.Second)
if err != nil {
return fmt.Errorf("error while getting gpu utilization: %v", err)
}
stats.Accelerators = append(stats.Accelerators, info.AcceleratorStats{
Make: "nvidia",
Model: model,
ID: uuid,
MemoryTotal: memoryTotal,
MemoryUsed: memoryUsed,
DutyCycle: uint64(utilizationGPU),
})
}
return nil
}


@@ -0,0 +1,32 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package accelerators
import info "github.com/google/cadvisor/info/v1"
// This is supposed to store global state about an accelerator metrics collector.
// cadvisor manager will call Setup() when it starts and Destroy() when it stops.
// For each container detected by the cadvisor manager, it will call
// GetCollector() with the devices cgroup path for that container.
// GetCollector() is supposed to return an object that can update
// accelerator stats for that container.
type AcceleratorManager interface {
Setup()
Destroy()
GetCollector(deviceCgroup string) (AcceleratorCollector, error)
}
type AcceleratorCollector interface {
UpdateStats(*info.ContainerStats) error
}
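The comment above describes the intended lifecycle: Setup() once when the manager starts, GetCollector() once per detected container, UpdateStats() on every housekeeping pass, and Destroy() at shutdown. A minimal sketch of a caller wiring these interfaces together, mirroring the manager changes later in this commit (the cgroup path and stats variable are placeholders):

package main

import (
	"log"

	"github.com/google/cadvisor/accelerators"
	info "github.com/google/cadvisor/info/v1"
)

func main() {
	var mgr accelerators.AcceleratorManager = &accelerators.NvidiaManager{}
	mgr.Setup()         // once, when the manager starts
	defer mgr.Destroy() // once, when the manager stops

	// Per container: pass that container's devices cgroup path.
	devicesCgroupPath := "/sys/fs/cgroup/devices/docker/<container-id>" // placeholder
	collector, err := mgr.GetCollector(devicesCgroupPath)
	if err != nil {
		log.Printf("GPU metrics may be unavailable/incomplete: %v", err)
	}

	// Per housekeeping pass: the collector appends to stats.Accelerators.
	stats := &info.ContainerStats{}
	if err := collector.UpdateStats(stats); err != nil {
		log.Printf("failed to update accelerator stats: %v", err)
	}
	log.Printf("accelerators: %+v", stats.Accelerators)
}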


@@ -11,12 +11,12 @@ go_library(
importpath = "github.com/google/cadvisor/container/common",
visibility = ["//visibility:public"],
deps = [
"//vendor/github.com/fsnotify/fsnotify:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/container:go_default_library",
"//vendor/github.com/google/cadvisor/fs:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/google/cadvisor/utils:go_default_library",
"//vendor/golang.org/x/exp/inotify:go_default_library",
],
)


@@ -51,7 +51,6 @@ type realFsHandler struct {
}
const (
- longOp = time.Second
timeout = 2 * time.Minute
maxBackoffFactor = 20
)
@@ -93,10 +92,10 @@ func (fh *realFsHandler) update() error {
fh.Lock()
defer fh.Unlock()
fh.lastUpdate = time.Now()
- if rootDiskErr == nil && fh.rootfs != "" {
+ if rootInodeErr == nil && fh.rootfs != "" {
fh.usage.InodeUsage = inodeUsage
}
- if rootInodeErr == nil && fh.rootfs != "" {
+ if rootDiskErr == nil && fh.rootfs != "" {
fh.usage.TotalUsageBytes = baseUsage + extraDirUsage
}
if extraDiskErr == nil && fh.extraDir != "" {
@@ -111,6 +110,7 @@ func (fh *realFsHandler) update() error {
func (fh *realFsHandler) trackUsage() {
fh.update()
+ longOp := time.Second
for {
select {
case <-fh.stopChan:
@@ -128,7 +128,11 @@ func (fh *realFsHandler) trackUsage() {
}
duration := time.Since(start)
if duration > longOp {
glog.V(2).Infof("du and find on following dirs took %v: %v", duration, []string{fh.rootfs, fh.extraDir})
// adapt longOp time so that message doesn't continue to print
// if the long duration is persistent either because of slow
// disk or lots of containers.
longOp = longOp + time.Second
glog.V(2).Infof("du and find on following dirs took %v: %v; will not log again for this container unless duration exceeds %v", duration, []string{fh.rootfs, fh.extraDir}, longOp)
}
}
}


@@ -17,15 +17,15 @@ package common
import (
"sync"
"golang.org/x/exp/inotify"
"github.com/fsnotify/fsnotify"
)
- // Watcher for container-related inotify events in the cgroup hierarchy.
+ // Watcher for container-related fsnotify events in the cgroup hierarchy.
//
// Implementation is thread-safe.
type InotifyWatcher struct {
- // Underlying inotify watcher.
- watcher *inotify.Watcher
+ // Underlying fsnotify watcher.
+ watcher *fsnotify.Watcher
// Map of containers being watched to cgroup paths watched for that container.
containersWatched map[string]map[string]bool
@@ -35,7 +35,7 @@ type InotifyWatcher struct {
}
func NewInotifyWatcher() (*InotifyWatcher, error) {
- w, err := inotify.NewWatcher()
+ w, err := fsnotify.NewWatcher()
if err != nil {
return nil, err
}
@@ -53,9 +53,9 @@ func (iw *InotifyWatcher) AddWatch(containerName, dir string) (bool, error) {
cgroupsWatched, alreadyWatched := iw.containersWatched[containerName]
- // Register an inotify notification.
+ // Register an fsnotify notification.
if !cgroupsWatched[dir] {
- err := iw.watcher.AddWatch(dir, inotify.IN_CREATE|inotify.IN_DELETE|inotify.IN_MOVE)
+ err := iw.watcher.Add(dir)
if err != nil {
return alreadyWatched, err
}
@@ -84,9 +84,9 @@ func (iw *InotifyWatcher) RemoveWatch(containerName, dir string) (bool, error) {
return false, nil
}
- // Remove the inotify watch if it exists.
+ // Remove the fsnotify watch if it exists.
if cgroupsWatched[dir] {
- err := iw.watcher.RemoveWatch(dir)
+ err := iw.watcher.Remove(dir)
if err != nil {
return false, nil
}
@@ -104,15 +104,15 @@ func (iw *InotifyWatcher) RemoveWatch(containerName, dir string) (bool, error) {
// Errors are returned on this channel.
func (iw *InotifyWatcher) Error() chan error {
- return iw.watcher.Error
+ return iw.watcher.Errors
}
// Events are returned on this channel.
- func (iw *InotifyWatcher) Event() chan *inotify.Event {
- return iw.watcher.Event
+ func (iw *InotifyWatcher) Event() chan fsnotify.Event {
+ return iw.watcher.Events
}
- // Closes the inotify watcher.
+ // Closes the fsnotify watcher.
func (iw *InotifyWatcher) Close() error {
return iw.watcher.Close()
}
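Since the watcher now wraps github.com/fsnotify/fsnotify instead of golang.org/x/exp/inotify, the API surface it depends on is NewWatcher/Add/Remove plus the Events and Errors channels. A minimal standalone sketch of that surface (the watched path is a placeholder; unlike inotify there is no per-event mask at registration time, so filtering happens when events are received):

package main

import (
	"log"

	"github.com/fsnotify/fsnotify"
)

func main() {
	w, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatal(err)
	}
	defer w.Close()

	if err := w.Add("/sys/fs/cgroup/cpu"); err != nil { // placeholder path
		log.Fatal(err)
	}

	for {
		select {
		case ev, ok := <-w.Events:
			if !ok {
				return
			}
			// Keep only the event kinds the raw watcher cares about.
			if ev.Op&(fsnotify.Create|fsnotify.Remove|fsnotify.Rename) != 0 {
				log.Printf("cgroup change: %v %s", ev.Op, ev.Name)
			}
		case werr, ok := <-w.Errors:
			if !ok {
				return
			}
			log.Printf("watch error: %v", werr)
		}
	}
}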


@@ -12,9 +12,9 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//vendor/github.com/blang/semver:go_default_library",
"//vendor/github.com/docker/engine-api/client:go_default_library",
"//vendor/github.com/docker/engine-api/types:go_default_library",
"//vendor/github.com/docker/engine-api/types/container:go_default_library",
"//vendor/github.com/docker/docker/api/types:go_default_library",
"//vendor/github.com/docker/docker/api/types/container:go_default_library",
"//vendor/github.com/docker/docker/client:go_default_library",
"//vendor/github.com/docker/go-connections/tlsconfig:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/container:go_default_library",


@@ -21,7 +21,7 @@ import (
"net/http"
"sync"
dclient "github.com/docker/engine-api/client"
dclient "github.com/docker/docker/client"
"github.com/docker/go-connections/tlsconfig"
)


@@ -19,9 +19,8 @@ import (
"fmt"
"regexp"
"strconv"
"strings"
dockertypes "github.com/docker/engine-api/types"
dockertypes "github.com/docker/docker/api/types"
"golang.org/x/net/context"
"github.com/google/cadvisor/info/v1"
@@ -49,7 +48,6 @@ func StatusFromDockerInfo(dockerInfo dockertypes.Info) v1.DockerStatus {
out.Hostname = dockerInfo.Name
out.RootDir = dockerInfo.DockerRootDir
out.Driver = dockerInfo.Driver
- out.ExecDriver = dockerInfo.ExecutionDriver
out.NumImages = dockerInfo.Images
out.NumContainers = dockerInfo.Containers
out.DriverStatus = make(map[string]string, len(dockerInfo.DriverStatus))
@@ -119,13 +117,6 @@ func ValidateInfo() (*dockertypes.Info, error) {
return nil, fmt.Errorf("cAdvisor requires docker version %v or above but we have found version %v reported as %q", []int{1, 0, 0}, version, dockerInfo.ServerVersion)
}
- // Check that the libcontainer execdriver is used if the version is < 1.11
- // (execution drivers are no longer supported as of 1.11).
- if version[0] <= 1 && version[1] <= 10 &&
- !strings.HasPrefix(dockerInfo.ExecutionDriver, "native") {
- return nil, fmt.Errorf("docker found, but not using native exec driver")
- }
if dockerInfo.Driver == "" {
return nil, fmt.Errorf("failed to find docker storage driver")
}


@@ -24,7 +24,7 @@ import (
"sync"
"github.com/blang/semver"
dockertypes "github.com/docker/engine-api/types"
dockertypes "github.com/docker/docker/api/types"
"github.com/google/cadvisor/container"
"github.com/google/cadvisor/container/libcontainer"
"github.com/google/cadvisor/devicemapper"
@@ -35,7 +35,7 @@ import (
dockerutil "github.com/google/cadvisor/utils/docker"
"github.com/google/cadvisor/zfs"
docker "github.com/docker/engine-api/client"
docker "github.com/docker/docker/client"
"github.com/golang/glog"
"golang.org/x/net/context"
)


@@ -32,8 +32,8 @@ import (
dockerutil "github.com/google/cadvisor/utils/docker"
"github.com/google/cadvisor/zfs"
docker "github.com/docker/engine-api/client"
dockercontainer "github.com/docker/engine-api/types/container"
dockercontainer "github.com/docker/docker/api/types/container"
docker "github.com/docker/docker/client"
"github.com/golang/glog"
"github.com/opencontainers/runc/libcontainer/cgroups"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"


@@ -84,6 +84,7 @@ var supportedSubsystems map[string]struct{} = map[string]struct{}{
"memory": {},
"cpuset": {},
"blkio": {},
"devices": {},
}
// Get cgroup and networking stats of the specified container
@@ -452,6 +453,17 @@ var numCpusFunc = getNumberOnlineCPUs
func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode
ret.Cpu.Usage.System = s.CpuStats.CpuUsage.UsageInKernelmode
ret.Cpu.Usage.Total = 0
ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
if len(s.CpuStats.CpuUsage.PercpuUsage) == 0 {
// libcontainer's 'GetStats' can leave 'PercpuUsage' nil if it skipped the
// cpuacct subsystem.
return
}
numPossible := uint32(len(s.CpuStats.CpuUsage.PercpuUsage))
// Note that as of https://patchwork.kernel.org/patch/8607101/ (kernel v4.7),
// the percpu usage information includes extra zero values for all additional
@@ -470,15 +482,11 @@ func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats) {
numActual = minUint32(numPossible, numActual)
ret.Cpu.Usage.PerCpu = make([]uint64, numActual)
- ret.Cpu.Usage.Total = 0
for i := uint32(0); i < numActual; i++ {
ret.Cpu.Usage.PerCpu[i] = s.CpuStats.CpuUsage.PercpuUsage[i]
ret.Cpu.Usage.Total += s.CpuStats.CpuUsage.PercpuUsage[i]
}
- ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
- ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
- ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
}
// Copied from
@@ -509,6 +517,7 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.Usage = s.MemoryStats.Usage.Usage
ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
ret.Memory.Cache = s.MemoryStats.Stats["cache"]


@@ -603,10 +603,11 @@ func GetDirInodeUsage(dir string, timeout time.Duration) (uint64, error) {
glog.Infof("killing cmd %v due to timeout(%s)", findCmd.Args, timeout.String())
findCmd.Process.Kill()
})
- if err := findCmd.Wait(); err != nil {
+ err := findCmd.Wait()
+ timer.Stop()
+ if err != nil {
return 0, fmt.Errorf("cmd %v failed. stderr: %s; err: %v", findCmd.Args, stderr.String(), err)
}
- timer.Stop()
return counter.bytesWritten, nil
}


@@ -330,6 +330,10 @@ type MemoryStats struct {
// Units: Bytes.
Usage uint64 `json:"usage"`
// Maximum memory usage recorded.
// Units: Bytes.
MaxUsage uint64 `json:"max_usage"`
// Number of bytes of page cache memory.
// Units: Bytes.
Cache uint64 `json:"cache"`
@@ -516,6 +520,29 @@ type FsStats struct {
WeightedIoTime uint64 `json:"weighted_io_time"`
}
type AcceleratorStats struct {
// Make of the accelerator (nvidia, amd, google etc.)
Make string `json:"make"`
// Model of the accelerator (tesla-p100, tesla-k80 etc.)
Model string `json:"model"`
// ID of the accelerator.
ID string `json:"id"`
// Total accelerator memory.
// unit: bytes
MemoryTotal uint64 `json:"memory_total"`
// Total accelerator memory allocated.
// unit: bytes
MemoryUsed uint64 `json:"memory_used"`
// Percent of time over the past sample period during which
// the accelerator was actively processing.
DutyCycle uint64 `json:"duty_cycle"`
}
type ContainerStats struct {
// The time of this stat point.
Timestamp time.Time `json:"timestamp"`
@@ -530,6 +557,9 @@ type ContainerStats struct {
// Task load stats
TaskStats LoadStats `json:"task_stats,omitempty"`
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
Accelerators []AcceleratorStats `json:"accelerators,omitempty"`
// Custom metrics from all collectors
CustomMetrics map[string][]MetricVal `json:"custom_metrics,omitempty"`
}
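Given the JSON tags above, each entry in the new ContainerStats.Accelerators slice serializes with the keys make, model, id, memory_total, memory_used and duty_cycle. A small sketch with made-up sample values (not real device data):

package main

import (
	"encoding/json"
	"fmt"

	info "github.com/google/cadvisor/info/v1"
)

func main() {
	accs := []info.AcceleratorStats{{
		Make:        "nvidia",
		Model:       "tesla-k80",
		ID:          "GPU-0123",              // sample ID, not a real UUID
		MemoryTotal: 12 * 1024 * 1024 * 1024, // bytes
		MemoryUsed:  2 * 1024 * 1024 * 1024,  // bytes
		DutyCycle:   47,                      // percent
	}}
	out, _ := json.MarshalIndent(accs, "", "  ")
	fmt.Println(string(out))
}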


@@ -146,6 +146,8 @@ type ContainerStats struct {
Filesystem *FilesystemStats `json:"filesystem,omitempty"`
// Task load statistics
Load *v1.LoadStats `json:"load_stats,omitempty"`
// Metrics for Accelerators. Each Accelerator corresponds to one element in the array.
Accelerators []v1.AcceleratorStats `json:"accelerators,omitempty"`
// Custom Metrics
CustomMetrics map[string][]v1.MetricVal `json:"custom_metrics,omitempty"`
}


@@ -142,6 +142,9 @@ func ContainerStatsFromV1(containerName string, spec *v1.ContainerSpec, stats []
if spec.HasCustomMetrics {
stat.CustomMetrics = val.CustomMetrics
}
if len(val.Accelerators) > 0 {
stat.Accelerators = val.Accelerators
}
// TODO(rjnagal): Handle load stats.
newStats = append(newStats, stat)
}


@@ -16,6 +16,7 @@ go_library(
"//vendor/github.com/google/cadvisor/utils/cloudinfo:go_default_library",
"//vendor/github.com/google/cadvisor/utils/sysfs:go_default_library",
"//vendor/github.com/google/cadvisor/utils/sysinfo:go_default_library",
"//vendor/golang.org/x/sys/unix:go_default_library",
],
)


@@ -22,7 +22,6 @@ import (
"path/filepath"
"strconv"
"strings"
"syscall"
"github.com/google/cadvisor/fs"
info "github.com/google/cadvisor/info/v1"
@@ -31,6 +30,8 @@ import (
"github.com/google/cadvisor/utils/sysinfo"
"github.com/golang/glog"
"golang.org/x/sys/unix"
)
const hugepagesDirectory = "/sys/kernel/mm/hugepages/"
@@ -189,19 +190,11 @@ func ContainerOsVersion() string {
}
func KernelVersion() string {
uname := &syscall.Utsname{}
uname := &unix.Utsname{}
if err := syscall.Uname(uname); err != nil {
if err := unix.Uname(uname); err != nil {
return "Unknown"
}
- release := make([]byte, len(uname.Release))
- i := 0
- for _, c := range uname.Release {
- release[i] = byte(c)
- i++
- }
- release = release[:bytes.IndexByte(release, 0)]
- return string(release)
+ return string(uname.Release[:bytes.IndexByte(uname.Release[:], 0)])
}
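This is the change the golang.org/x/sys bump in the commit message supports (google/cadvisor#1786): with the updated package, unix.Utsname fields are fixed-size byte arrays on Linux, so the release string is simply the bytes up to the first NUL, and the element-by-element copy that syscall.Utsname's int8-based arrays required goes away. A standalone sketch of the same pattern (Linux-only):

package main

import (
	"bytes"
	"fmt"

	"golang.org/x/sys/unix"
)

func kernelVersion() string {
	var uname unix.Utsname
	if err := unix.Uname(&uname); err != nil {
		return "Unknown"
	}
	// Release is a fixed-size byte array; trim at the first NUL terminator.
	return string(uname.Release[:bytes.IndexByte(uname.Release[:], 0)])
}

func main() {
	fmt.Println(kernelVersion())
}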


@@ -24,7 +24,6 @@ import (
// s390/s390x changes
"runtime"
"syscall"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/utils"
@@ -32,6 +31,8 @@ import (
"github.com/google/cadvisor/utils/sysinfo"
"github.com/golang/glog"
"golang.org/x/sys/unix"
)
var (
@@ -265,18 +266,13 @@ func addNode(nodes *[]info.Node, id int) (int, error) {
// s390/s390x changes
func getMachineArch() (string, error) {
- uname := syscall.Utsname{}
- err := syscall.Uname(&uname)
+ uname := unix.Utsname{}
+ err := unix.Uname(&uname)
if err != nil {
return "", err
}
- var arch string
- for _, val := range uname.Machine {
- arch += string(int(val))
- }
- return arch, nil
+ return string(uname.Machine[:]), nil
}
// arm32 changes


@@ -11,6 +11,7 @@ go_library(
deps = [
"//vendor/github.com/docker/go-units:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/accelerators:go_default_library",
"//vendor/github.com/google/cadvisor/cache/memory:go_default_library",
"//vendor/github.com/google/cadvisor/collector:go_default_library",
"//vendor/github.com/google/cadvisor/container:go_default_library",


@@ -29,6 +29,7 @@ import (
"sync"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
@@ -78,6 +79,9 @@ type containerData struct {
// Runs custom metric collectors.
collectorManager collector.CollectorManager
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
nvidiaCollector accelerators.AcceleratorCollector
}
// jitter returns a time.Duration between duration and duration + maxFactor * duration,
@@ -557,6 +561,12 @@ func (c *containerData) updateStats() error {
}
}
var nvidiaStatsErr error
if c.nvidiaCollector != nil {
// This updates the Accelerators field of the stats struct
nvidiaStatsErr = c.nvidiaCollector.UpdateStats(stats)
}
ref, err := c.handler.ContainerReference()
if err != nil {
// Ignore errors if the container is dead.
@@ -572,6 +582,9 @@ func (c *containerData) updateStats() error {
if statsErr != nil {
return statsErr
}
if nvidiaStatsErr != nil {
return nvidiaStatsErr
}
return customStatsErr
}


@@ -18,6 +18,7 @@ package manager
import (
"flag"
"fmt"
"net/http"
"os"
"path"
"strconv"
@@ -25,6 +26,7 @@ import (
"sync"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
@@ -45,8 +47,6 @@ import (
"github.com/google/cadvisor/utils/sysfs"
"github.com/google/cadvisor/version"
"net/http"
"github.com/golang/glog"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
@@ -148,13 +148,19 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
}
glog.Infof("cAdvisor running in container: %q", selfContainer)
- dockerStatus, err := docker.Status()
- if err != nil {
+ var (
+ dockerStatus info.DockerStatus
+ rktPath string
+ )
+ if tempDockerStatus, err := docker.Status(); err != nil {
glog.Warningf("Unable to connect to Docker: %v", err)
+ } else {
+ dockerStatus = tempDockerStatus
}
- rktPath, err := rkt.RktPath()
- if err != nil {
+ if tmpRktPath, err := rkt.RktPath(); err != nil {
glog.Warningf("unable to connect to Rkt api service: %v", err)
+ } else {
+ rktPath = tmpRktPath
}
crioClient, err := crio.Client()
@@ -206,6 +212,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHttpClient: collectorHttpClient,
nvidiaManager: &accelerators.NvidiaManager{},
}
machineInfo, err := machine.Info(sysfs, fsInfo, inHostNamespace)
@@ -251,6 +258,7 @@ type manager struct {
containerWatchers []watcher.ContainerWatcher
eventsChannel chan watcher.ContainerEvent
collectorHttpClient *http.Client
nvidiaManager accelerators.AcceleratorManager
}
// Start the container manager.
@@ -303,6 +311,9 @@ func (self *manager) Start() error {
return nil
}
// Setup collection of nvidia GPU metrics if any of them are attached to the machine.
self.nvidiaManager.Setup()
// Create root and then recover all containers.
err = self.createContainer("/", watcher.Raw)
if err != nil {
@@ -332,6 +343,7 @@ func (self *manager) Start() error {
}
func (self *manager) Stop() error {
defer self.nvidiaManager.Destroy()
// Stop and wait on all quit channels.
for i, c := range self.quitChannels {
// Send the exit signal and wait on the thread to exit (by closing the channel).
@@ -911,6 +923,15 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
if err != nil {
return err
}
devicesCgroupPath, err := handler.GetCgroupPath("devices")
if err != nil {
glog.Infof("Error getting devices cgroup path: %v", err)
} else {
cont.nvidiaCollector, err = m.nvidiaManager.GetCollector(devicesCgroupPath)
if err != nil {
glog.Infof("GPU metrics may be unavailable/incomplete for container %q: %v", cont.info.Name, err)
}
}
// Add collectors
labels := handler.GetContainerLabels()


@@ -6,11 +6,11 @@ go_library(
importpath = "github.com/google/cadvisor/manager/watcher/raw",
visibility = ["//visibility:public"],
deps = [
"//vendor/github.com/fsnotify/fsnotify:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/container/common:go_default_library",
"//vendor/github.com/google/cadvisor/container/libcontainer:go_default_library",
"//vendor/github.com/google/cadvisor/manager/watcher:go_default_library",
"//vendor/golang.org/x/exp/inotify:go_default_library",
],
)


@@ -27,8 +27,8 @@ import (
"github.com/google/cadvisor/container/libcontainer"
"github.com/google/cadvisor/manager/watcher"
"github.com/fsnotify/fsnotify"
"github.com/golang/glog"
"golang.org/x/exp/inotify"
)
type rawContainerWatcher struct {
@@ -121,7 +121,7 @@ func (self *rawContainerWatcher) watchDirectory(dir string, containerName string
if cleanup {
_, err := self.watcher.RemoveWatch(containerName, dir)
if err != nil {
glog.Warningf("Failed to remove inotify watch for %q: %v", dir, err)
glog.Warningf("Failed to remove fsnotify watch for %q: %v", dir, err)
}
}
}()
@@ -152,18 +152,16 @@ func (self *rawContainerWatcher) watchDirectory(dir string, containerName string
return alreadyWatching, nil
}
- func (self *rawContainerWatcher) processEvent(event *inotify.Event, events chan watcher.ContainerEvent) error {
- // Convert the inotify event type to a container create or delete.
+ func (self *rawContainerWatcher) processEvent(event fsnotify.Event, events chan watcher.ContainerEvent) error {
+ // Convert the fsnotify event type to a container create or delete.
var eventType watcher.ContainerEventType
switch {
case (event.Mask & inotify.IN_CREATE) > 0:
case event.Op == fsnotify.Create:
eventType = watcher.ContainerAdd
case (event.Mask & inotify.IN_DELETE) > 0:
case event.Op == fsnotify.Remove:
eventType = watcher.ContainerDelete
case (event.Mask & inotify.IN_MOVED_FROM) > 0:
case event.Op == fsnotify.Rename:
eventType = watcher.ContainerDelete
case (event.Mask & inotify.IN_MOVED_TO) > 0:
eventType = watcher.ContainerAdd
default:
// Ignore other events.
return nil


@@ -226,11 +226,19 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
},
}, {
name: "container_memory_usage_bytes",
help: "Current memory usage in bytes.",
help: "Current memory usage in bytes, including all memory regardless of when it was accessed",
valueType: prometheus.GaugeValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Memory.Usage)}}
},
},
+ {
+ name: "container_memory_max_usage_bytes",
+ help: "Maximum memory usage recorded in bytes",
+ valueType: prometheus.GaugeValue,
+ getValues: func(s *info.ContainerStats) metricValues {
+ return metricValues{{value: float64(s.Memory.MaxUsage)}}
+ },
}, {
name: "container_memory_working_set_bytes",
help: "Current working set in bytes.",
@@ -263,6 +271,51 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
},
}
},
}, {
name: "container_accelerator_memory_total_bytes",
help: "Total accelerator memory.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryTotal),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_accelerator_memory_used_bytes",
help: "Total accelerator memory allocated.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.MemoryUsed),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_accelerator_duty_cycle",
help: "Percent of time over the past sample period during which the accelerator was actively processing.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"make", "model", "acc_id"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Accelerators))
for _, value := range s.Accelerators {
values = append(values, metricValue{
value: float64(value.DutyCycle),
labels: []string{value.Make, value.Model, value.ID},
})
}
return values
},
}, {
name: "container_fs_inodes_free",
help: "Number of available Inodes",
@@ -794,6 +847,8 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric)
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.Limit), values...)
desc = prometheus.NewDesc("container_spec_memory_swap_limit_bytes", "Memory swap limit for the container.", labels, nil)
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.SwapLimit), values...)
desc = prometheus.NewDesc("container_spec_memory_reservation_limit_bytes", "Memory reservation limit for the container.", labels, nil)
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.Reservation), values...)
}
// Now for the actual metrics
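For context, the three new accelerator gauges registered above would appear in a scrape roughly as follows; the label values are made-up samples and the usual per-container labels (name, image, and so on) are elided:

container_accelerator_memory_total_bytes{acc_id="GPU-0123",make="nvidia",model="tesla-k80"} 1.2884901888e+10
container_accelerator_memory_used_bytes{acc_id="GPU-0123",make="nvidia",model="tesla-k80"} 2.147483648e+09
container_accelerator_duty_cycle{acc_id="GPU-0123",make="nvidia",model="tesla-k80"} 47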


@@ -5,7 +5,7 @@ go_library(
srcs = ["docker.go"],
importpath = "github.com/google/cadvisor/utils/docker",
visibility = ["//visibility:public"],
deps = ["//vendor/github.com/docker/engine-api/types:go_default_library"],
deps = ["//vendor/github.com/docker/docker/api/types:go_default_library"],
)
filegroup(


@@ -19,7 +19,7 @@ import (
"os"
"strings"
dockertypes "github.com/docker/engine-api/types"
dockertypes "github.com/docker/docker/api/types"
)
const (


@@ -190,7 +190,7 @@ func validateDockerInfo() (string, string) {
return Unsupported, fmt.Sprintf("Docker setup is invalid: %v", err)
}
desc := fmt.Sprintf("Docker exec driver is %s. Storage driver is %s.\n", info.ExecutionDriver, info.Driver)
desc := fmt.Sprintf("Storage driver is %s.\n", info.Driver)
return Recommended, desc
}