update cadvisor godeps to v0.30.2

David Ashpole
2018-06-21 13:28:00 -07:00
parent 7ab706a943
commit e13362c93c
6 changed files with 147 additions and 130 deletions

vendor/github.com/google/cadvisor/accelerators/nvidia.go View File

@@ -31,7 +31,10 @@ import (
 )
 type NvidiaManager struct {
-	sync.RWMutex
+	sync.Mutex
+	// true if there are NVIDIA devices present on the node
+	devicesPresent bool
 	// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
 	nvmlInitialized bool
@@ -51,20 +54,9 @@ func (nm *NvidiaManager) Setup() {
 		return
 	}
-	nm.initializeNVML()
-	if nm.nvmlInitialized {
-		return
-	}
-	go func() {
-		glog.V(2).Info("Starting goroutine to initialize NVML")
-		// TODO: use globalHousekeepingInterval
-		for range time.Tick(time.Minute) {
-			nm.initializeNVML()
-			if nm.nvmlInitialized {
-				return
-			}
-		}
-	}()
+	nm.devicesPresent = true
+	initializeNVML(nm)
 }
 // detectDevices returns true if a device with given pci id is present on the node.
@@ -91,20 +83,18 @@ func detectDevices(vendorId string) bool {
 }
 // initializeNVML initializes the NVML library and sets up the nvmlDevices map.
-func (nm *NvidiaManager) initializeNVML() {
+// This is defined as a variable to help in testing.
+var initializeNVML = func(nm *NvidiaManager) {
 	if err := gonvml.Initialize(); err != nil {
 		// This is under a logging level because otherwise we may cause
 		// log spam if the drivers/nvml is not installed on the system.
 		glog.V(4).Infof("Could not initialize NVML: %v", err)
 		return
 	}
+	nm.nvmlInitialized = true
 	numDevices, err := gonvml.DeviceCount()
 	if err != nil {
 		glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
-		nm.Lock()
-		// Even though we won't have GPU metrics, the library was initialized and should be shutdown when exiting.
-		nm.nvmlInitialized = true
-		nm.Unlock()
 		return
 	}
 	glog.V(1).Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
@@ -122,10 +112,6 @@ func (nm *NvidiaManager) initializeNVML() {
 		}
 		nm.nvidiaDevices[int(minorNumber)] = device
 	}
-	nm.Lock()
-	// Doing this at the end to avoid race in accessing nvidiaDevices in GetCollector.
-	nm.nvmlInitialized = true
-	nm.Unlock()
 }
 // Destroy shuts down NVML.
@@ -139,12 +125,21 @@ func (nm *NvidiaManager) Destroy() {
 // present in the devices.list file in the given devicesCgroupPath.
 func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
 	nc := &NvidiaCollector{}
-	nm.RLock()
-	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
-		nm.RUnlock()
+	if !nm.devicesPresent {
 		return nc, nil
 	}
-	nm.RUnlock()
+	// Makes sure that we don't call initializeNVML() concurrently and
+	// that we only call initializeNVML() when it's not initialized.
+	nm.Lock()
+	if !nm.nvmlInitialized {
+		initializeNVML(nm)
+	}
+	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
+		nm.Unlock()
+		return nc, nil
+	}
+	nm.Unlock()
 	nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
 	if err != nil {
 		return nc, err
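
The pattern in this file is worth seeing outside the diff: Setup records device presence once (it runs single-threaded, so that read needs no lock), and GetCollector retries initialization under the mutex until it succeeds, replacing the old background time.Tick goroutine. Below is a minimal, self-contained sketch of that pattern; the lazyManager type and its method names are illustrative stand-ins, not cAdvisor's API.

package main

import (
	"fmt"
	"sync"
)

type lazyManager struct {
	sync.Mutex
	devicesPresent bool // written once in setup, before any concurrent use
	initialized    bool // guarded by the embedded mutex
}

// tryInit stands in for initializeNVML: it can fail now and succeed on a
// later call, e.g. once drivers finish loading. Here it always succeeds.
func (m *lazyManager) tryInit() {
	m.initialized = true
}

// setup mirrors Setup above: record that devices exist, then make one
// initialization attempt.
func (m *lazyManager) setup() {
	m.devicesPresent = true
	m.Lock()
	m.tryInit()
	m.Unlock()
}

// collect mirrors GetCollector: the unlocked devicesPresent read is safe
// because the field is only written in setup, before collectors run; the
// initialized flag is re-checked and retried under the mutex.
func (m *lazyManager) collect() error {
	if !m.devicesPresent {
		return nil // nothing to collect on this node
	}
	m.Lock()
	if !m.initialized {
		m.tryInit()
	}
	ok := m.initialized
	m.Unlock()
	if !ok {
		return fmt.Errorf("not initialized yet, will retry on next call")
	}
	return nil
}

func main() {
	m := &lazyManager{}
	m.setup()
	fmt.Println(m.collect()) // <nil>
}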

vendor/github.com/google/cadvisor/container/crio/handler.go View File

@@ -65,9 +65,6 @@ type crioContainerHandler struct {
 	ignoreMetrics container.MetricSet
-	// container restart count
-	restartCount int
 	reference info.ContainerReference
 	libcontainerHandler *containerlibcontainer.Handler
@@ -166,7 +163,10 @@ func newCrioContainerHandler(
 	// ignore err and get zero as default, this happens with sandboxes, not sure why...
 	// kube isn't sending restart count in labels for sandboxes.
 	restartCount, _ := strconv.Atoi(cInfo.Annotations["io.kubernetes.container.restartCount"])
-	handler.restartCount = restartCount
+	// Only adds restartcount label if it's greater than 0
+	if restartCount > 0 {
+		handler.labels["restartcount"] = strconv.Itoa(restartCount)
+	}
 	handler.ipAddress = cInfo.IP
@@ -210,10 +210,6 @@ func (self *crioContainerHandler) GetSpec() (info.ContainerSpec, error) {
 	spec, err := common.GetSpec(self.cgroupPaths, self.machineInfoFactory, self.needNet(), hasFilesystem)
 	spec.Labels = self.labels
-	// Only adds restartcount label if it's greater than 0
-	if self.restartCount > 0 {
-		spec.Labels["restartcount"] = strconv.Itoa(self.restartCount)
-	}
 	spec.Envs = self.envs
 	spec.Image = self.image

vendor/github.com/google/cadvisor/container/docker/handler.go View File

@@ -91,9 +91,6 @@ type dockerContainerHandler struct {
 	// zfsParent is the parent for docker zfs
 	zfsParent string
-	// container restart count
-	restartCount int
 	// Reference to the container
 	reference info.ContainerReference
@@ -226,7 +223,10 @@ func newDockerContainerHandler(
 	}
 	handler.image = ctnr.Config.Image
 	handler.networkMode = ctnr.HostConfig.NetworkMode
-	handler.restartCount = ctnr.RestartCount
+	// Only adds restartcount label if it's greater than 0
+	if ctnr.RestartCount > 0 {
+		handler.labels["restartcount"] = strconv.Itoa(ctnr.RestartCount)
+	}
 	// Obtain the IP address for the container.
 	// If the NetworkMode starts with 'container:' then we need to use the IP address of the container specified.
@@ -356,10 +356,6 @@ func (self *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {
 	spec, err := common.GetSpec(self.cgroupPaths, self.machineInfoFactory, self.needNet(), hasFilesystem)
 	spec.Labels = self.labels
-	// Only adds restartcount label if it's greater than 0
-	if self.restartCount > 0 {
-		spec.Labels["restartcount"] = strconv.Itoa(self.restartCount)
-	}
 	spec.Envs = self.envs
 	spec.Image = self.image
 	spec.CreationTime = self.creationTime
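
The crio and docker handlers both apply the same idea here: compute the restartcount label once at construction and drop the restartCount field, so GetSpec simply returns the prepared label map. A small sketch of that construction-time pattern; the handler type and function names are hypothetical.

package main

import (
	"fmt"
	"strconv"
)

type handler struct {
	labels map[string]string
}

// newHandler computes the label once; the restart count never changes for a
// given handler instance, so there is nothing to recompute later.
func newHandler(restartCount int) *handler {
	h := &handler{labels: map[string]string{}}
	// Only add the restartcount label when it carries information.
	if restartCount > 0 {
		h.labels["restartcount"] = strconv.Itoa(restartCount)
	}
	return h
}

// getSpec returns the prepared labels as-is, like GetSpec above.
func (h *handler) getSpec() map[string]string {
	return h.labels
}

func main() {
	fmt.Println(newHandler(0).getSpec()) // map[]
	fmt.Println(newHandler(3).getSpec()) // map[restartcount:3]
}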

vendor/github.com/google/cadvisor/fs/fs.go View File

@@ -533,6 +533,21 @@ func (self *RealFsInfo) GetDirFsDevice(dir string) (*DeviceInfo, error) {
 	}
 	mount, found := self.mounts[dir]
+	// try the parent dir if not found until we reach the root dir
+	// this is an issue on btrfs systems where the directory is not
+	// the subvolume
+	for !found {
+		pathdir, _ := filepath.Split(dir)
+		// break when we reach root
+		if pathdir == "/" {
+			break
+		}
+		// trim "/" from the new parent path otherwise the next possible
+		// filepath.Split in the loop will not split the string any further
+		dir = strings.TrimSuffix(pathdir, "/")
+		mount, found = self.mounts[dir]
+	}
 	if found && mount.Fstype == "btrfs" && mount.Major == 0 && strings.HasPrefix(mount.Source, "/dev/") {
 		major, minor, err := getBtrfsMajorMinorIds(mount)
 		if err != nil {
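
The TrimSuffix step is the subtle part of the loop above: filepath.Split leaves a trailing slash on the directory component, and without trimming it the next Split would return the same path and the loop would never reach the root. A standalone sketch of the walk, assuming a made-up mount table keyed by path:

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// findMount walks from dir up toward "/" until it finds an entry in the
// mount table, as on btrfs where a directory may sit below the subvolume
// that actually appears in the table.
func findMount(mounts map[string]bool, dir string) (string, bool) {
	found := mounts[dir]
	for !found {
		pathdir, _ := filepath.Split(dir)
		if pathdir == "/" {
			break
		}
		// Without TrimSuffix, filepath.Split("/var/lib/") would keep
		// returning "/var/lib/" and the loop would never terminate.
		dir = strings.TrimSuffix(pathdir, "/")
		found = mounts[dir]
	}
	return dir, found
}

func main() {
	mounts := map[string]bool{"/var/lib": true}
	fmt.Println(findMount(mounts, "/var/lib/docker/btrfs/subvolumes")) // /var/lib true
	fmt.Println(findMount(mounts, "/home/user"))                       // /home false
}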

vendor/github.com/google/cadvisor/container/raw/watcher.go View File

@@ -110,6 +110,11 @@ func (self *rawContainerWatcher) Stop() error {
 // Watches the specified directory and all subdirectories. Returns whether the path was
 // already being watched and an error (if any).
 func (self *rawContainerWatcher) watchDirectory(events chan watcher.ContainerEvent, dir string, containerName string) (bool, error) {
+	// Don't watch .mount cgroups because they never have containers as sub-cgroups. A single container
+	// can have many .mount cgroups associated with it which can quickly exhaust the inotify watches on a node.
+	if strings.HasSuffix(containerName, ".mount") {
+		return false, nil
+	}
 	alreadyWatching, err := self.watcher.AddWatch(containerName, dir)
 	if err != nil {
 		return alreadyWatching, err
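
The suffix check itself is trivial; the reasoning in the diff's comment (each .mount cgroup consumes an inotify watch, and a busy node can accumulate many of them) is the point. A tiny sketch of the filter, with example cgroup names:

package main

import (
	"fmt"
	"strings"
)

// shouldWatch reports whether a cgroup deserves an inotify watch: .mount
// cgroups never hold containers as sub-cgroups, so watching them only
// burns watch slots.
func shouldWatch(containerName string) bool {
	return !strings.HasSuffix(containerName, ".mount")
}

func main() {
	fmt.Println(shouldWatch("/system.slice/var-lib-docker.mount")) // false
	fmt.Println(shouldWatch("/kubepods/pod1234"))                  // true
}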