Update cadvisor and containerd

This commit is contained in:
Davanum Srinivas
2020-03-24 13:11:42 -04:00
parent d00f9c7c10
commit 4274ea2c89
172 changed files with 8572 additions and 4031 deletions

View File

@@ -2,15 +2,13 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = [
"nvidia.go",
"types.go",
],
srcs = ["nvidia.go"],
importmap = "k8s.io/kubernetes/vendor/github.com/google/cadvisor/accelerators",
importpath = "github.com/google/cadvisor/accelerators",
visibility = ["//visibility:public"],
deps = [
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/google/cadvisor/stats:go_default_library",
"//vendor/github.com/mindprince/gonvml:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],

View File

@@ -25,12 +25,13 @@ import (
"time"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/stats"
"github.com/mindprince/gonvml"
"k8s.io/klog"
)
type NvidiaManager struct {
type nvidiaManager struct {
sync.Mutex
// true if there are NVIDIA devices present on the node
@@ -47,8 +48,12 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"
const nvidiaVendorId = "0x10de"
func NewNvidiaManager() stats.Manager {
return &nvidiaManager{}
}
// Setup initializes NVML if nvidia devices are present on the node.
func (nm *NvidiaManager) Setup() {
func (nm *nvidiaManager) Setup() {
if !detectDevices(nvidiaVendorId) {
klog.V(4).Info("No NVIDIA devices found.")
return
@@ -84,7 +89,7 @@ func detectDevices(vendorId string) bool {
// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
// This is defined as a variable to help in testing.
var initializeNVML = func(nm *NvidiaManager) {
var initializeNVML = func(nm *nvidiaManager) {
if err := gonvml.Initialize(); err != nil {
// This is under a logging level because otherwise we may cause
// log spam if the drivers/nvml is not installed on the system.
@@ -115,7 +120,7 @@ var initializeNVML = func(nm *NvidiaManager) {
}
// Destroy shuts down NVML.
func (nm *NvidiaManager) Destroy() {
func (nm *nvidiaManager) Destroy() {
if nm.nvmlInitialized {
gonvml.Shutdown()
}
@@ -123,8 +128,8 @@ func (nm *NvidiaManager) Destroy() {
// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
nc := &NvidiaCollector{}
func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
nc := &nvidiaCollector{}
if !nm.devicesPresent {
return nc, nil
@@ -149,7 +154,7 @@ func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorColl
if !ok {
return nc, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor)
}
nc.Devices = append(nc.Devices, device)
nc.devices = append(nc.devices, device)
}
return nc, nil
}
@@ -208,14 +213,18 @@ var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
return nvidiaMinorNumbers, nil
}
type NvidiaCollector struct {
type nvidiaCollector struct {
// Exposed for testing
Devices []gonvml.Device
devices []gonvml.Device
}
func NewNvidiaCollector(devices []gonvml.Device) stats.Collector {
return &nvidiaCollector{devices: devices}
}
// UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
func (nc *NvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
for _, device := range nc.Devices {
func (nc *nvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
for _, device := range nc.devices {
model, err := device.Name()
if err != nil {
return fmt.Errorf("error while getting gpu name: %v", err)

View File

@@ -184,6 +184,17 @@ func (s byName) Less(i, j int) bool {
return s.prometheusLabels[i].GetName() < s.prometheusLabels[j].GetName()
}
func prometheusLabelSetToCadvisorLabels(promLabels model.Metric) map[string]string {
labels := make(map[string]string)
for k, v := range promLabels {
if string(k) == "__name__" {
continue
}
labels[string(k)] = string(v)
}
return labels
}
func prometheusLabelSetToCadvisorLabel(promLabels model.Metric) string {
labels := labelSetToLabelPairs(promLabels)
sort.Sort(byName{labels})
@@ -247,11 +258,13 @@ func (collector *PrometheusCollector) Collect(metrics map[string][]v1.MetricVal)
// TODO Handle multiple labels nicer. Prometheus metrics can have multiple
// labels, cadvisor only accepts a single string for the metric label.
label := prometheusLabelSetToCadvisorLabel(sample.Metric)
labels := prometheusLabelSetToCadvisorLabels(sample.Metric)
metric := v1.MetricVal{
FloatValue: float64(sample.Value),
Timestamp: sample.Timestamp.Time(),
Label: label,
Labels: labels,
}
newMetrics[metName] = append(newMetrics[metName], metric)
if len(newMetrics) > collector.metricCountLimit {

View File

@@ -92,10 +92,11 @@ func (fh *realFsHandler) update() error {
fh.lastUpdate = time.Now()
if fh.rootfs != "" && rootErr == nil {
fh.usage.InodeUsage = rootUsage.Inodes
fh.usage.TotalUsageBytes = rootUsage.Bytes + extraUsage.Bytes
fh.usage.BaseUsageBytes = rootUsage.Bytes
fh.usage.TotalUsageBytes = rootUsage.Bytes
}
if fh.extraDir != "" && extraErr == nil {
fh.usage.BaseUsageBytes = rootUsage.Bytes
fh.usage.TotalUsageBytes += extraUsage.Bytes
}
// Combine errors into a single error to return
@@ -106,31 +107,30 @@ func (fh *realFsHandler) update() error {
}
func (fh *realFsHandler) trackUsage() {
fh.update()
longOp := time.Second
for {
start := time.Now()
if err := fh.update(); err != nil {
klog.Errorf("failed to collect filesystem stats - %v", err)
fh.period = fh.period * 2
if fh.period > maxBackoffFactor*fh.minPeriod {
fh.period = maxBackoffFactor * fh.minPeriod
}
} else {
fh.period = fh.minPeriod
}
duration := time.Since(start)
if duration > longOp {
// adapt longOp time so that message doesn't continue to print
// if the long duration is persistent either because of slow
// disk or lots of containers.
longOp = longOp + time.Second
klog.V(2).Infof("fs: disk usage and inodes count on following dirs took %v: %v; will not log again for this container unless duration exceeds %v", duration, []string{fh.rootfs, fh.extraDir}, longOp)
}
select {
case <-fh.stopChan:
return
case <-time.After(fh.period):
start := time.Now()
if err := fh.update(); err != nil {
klog.Errorf("failed to collect filesystem stats - %v", err)
fh.period = fh.period * 2
if fh.period > maxBackoffFactor*fh.minPeriod {
fh.period = maxBackoffFactor * fh.minPeriod
}
} else {
fh.period = fh.minPeriod
}
duration := time.Since(start)
if duration > longOp {
// adapt longOp time so that message doesn't continue to print
// if the long duration is persistent either because of slow
// disk or lots of containers.
longOp = longOp + time.Second
klog.V(2).Infof("fs: disk usage and inodes count on following dirs took %v: %v; will not log again for this container unless duration exceeds %v", duration, []string{fh.rootfs, fh.extraDir}, longOp)
}
}
}
}

View File

@@ -109,8 +109,9 @@ func GetSpec(cgroupPaths map[string]string, machineInfoFactory info.MachineInfoF
val, err := strconv.ParseUint(quota, 10, 64)
if err != nil {
klog.Errorf("GetSpec: Failed to parse CPUQuota from %q: %s", path.Join(cpuRoot, "cpu.cfs_quota_us"), err)
} else {
spec.Cpu.Quota = val
}
spec.Cpu.Quota = val
}
}
}
@@ -155,6 +156,14 @@ func GetSpec(cgroupPaths map[string]string, machineInfoFactory info.MachineInfoF
}
}
// Hugepage
hugepageRoot, ok := cgroupPaths["hugetlb"]
if ok {
if utils.FileExists(hugepageRoot) {
spec.HasHugetlb = true
}
}
// Processes, read it's value from pids path directly
pidsRoot, ok := cgroupPaths["pids"]
if ok {

View File

@@ -17,9 +17,9 @@ go_library(
"//vendor/github.com/containerd/containerd/api/services/tasks/v1:go_default_library",
"//vendor/github.com/containerd/containerd/api/services/version/v1:go_default_library",
"//vendor/github.com/containerd/containerd/containers:go_default_library",
"//vendor/github.com/containerd/containerd/dialer:go_default_library",
"//vendor/github.com/containerd/containerd/errdefs:go_default_library",
"//vendor/github.com/containerd/containerd/namespaces:go_default_library",
"//vendor/github.com/containerd/containerd/pkg/dialer:go_default_library",
"//vendor/github.com/gogo/protobuf/types:go_default_library",
"//vendor/github.com/google/cadvisor/container:go_default_library",
"//vendor/github.com/google/cadvisor/container/common:go_default_library",

View File

@@ -25,8 +25,8 @@ import (
tasksapi "github.com/containerd/containerd/api/services/tasks/v1"
versionapi "github.com/containerd/containerd/api/services/version/v1"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/dialer"
"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/pkg/dialer"
ptypes "github.com/gogo/protobuf/types"
"google.golang.org/grpc"
)

View File

@@ -272,6 +272,9 @@ func (self *crioContainerHandler) getFsStats(stats *info.ContainerStats) error {
}
}
if fsType == "" {
return fmt.Errorf("unable to determine fs type for device: %v", device)
}
fsStat := info.FsStats{Device: device, Type: fsType, Limit: limit}
usage := self.fsHandler.Usage()
fsStat.BaseUsage = usage.BaseUsageBytes

View File

@@ -20,12 +20,12 @@ go_library(
"//vendor/github.com/docker/go-connections/tlsconfig:go_default_library",
"//vendor/github.com/google/cadvisor/container:go_default_library",
"//vendor/github.com/google/cadvisor/container/common:go_default_library",
"//vendor/github.com/google/cadvisor/container/docker/utils:go_default_library",
"//vendor/github.com/google/cadvisor/container/libcontainer:go_default_library",
"//vendor/github.com/google/cadvisor/devicemapper:go_default_library",
"//vendor/github.com/google/cadvisor/fs:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/google/cadvisor/machine:go_default_library",
"//vendor/github.com/google/cadvisor/utils/docker:go_default_library",
"//vendor/github.com/google/cadvisor/watcher:go_default_library",
"//vendor/github.com/google/cadvisor/zfs:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
@@ -47,6 +47,7 @@ filegroup(
srcs = [
":package-srcs",
"//vendor/github.com/google/cadvisor/container/docker/install:all-srcs",
"//vendor/github.com/google/cadvisor/container/docker/utils:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],

View File

@@ -22,16 +22,17 @@ import (
"strconv"
"strings"
"sync"
"time"
"github.com/blang/semver"
dockertypes "github.com/docker/docker/api/types"
"github.com/google/cadvisor/container"
dockerutil "github.com/google/cadvisor/container/docker/utils"
"github.com/google/cadvisor/container/libcontainer"
"github.com/google/cadvisor/devicemapper"
"github.com/google/cadvisor/fs"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/machine"
dockerutil "github.com/google/cadvisor/utils/docker"
"github.com/google/cadvisor/watcher"
"github.com/google/cadvisor/zfs"
@@ -49,11 +50,17 @@ var ArgDockerCA = flag.String("docker-tls-ca", "ca.pem", "path to trusted CA")
// The namespace under which Docker aliases are unique.
const DockerNamespace = "docker"
// The retry times for getting docker root dir
const rootDirRetries = 5
//The retry period for getting docker root dir, Millisecond
const rootDirRetryPeriod time.Duration = 1000 * time.Millisecond
// Regexp that identifies docker cgroups, containers started with
// --cgroup-parent have another prefix than 'docker'
var dockerCgroupRegexp = regexp.MustCompile(`([a-z0-9]{64})`)
var dockerEnvWhitelist = flag.String("docker_env_metadata_whitelist", "", "a comma-separated list of environment variable keys that needs to be collected for docker containers")
var dockerEnvWhitelist = flag.String("docker_env_metadata_whitelist", "", "a comma-separated list of environment variable keys matched with specified prefix that needs to be collected for docker containers")
var (
// Basepath to all container specific information that libcontainer stores.
@@ -72,10 +79,16 @@ var (
func RootDir() string {
dockerRootDirOnce.Do(func() {
status, err := Status()
if err == nil && status.RootDir != "" {
dockerRootDir = status.RootDir
} else {
for i := 0; i < rootDirRetries; i++ {
status, err := Status()
if err == nil && status.RootDir != "" {
dockerRootDir = status.RootDir
break
} else {
time.Sleep(rootDirRetryPeriod)
}
}
if dockerRootDir == "" {
dockerRootDir = *dockerRootDirFlag
}
})

View File

@@ -25,11 +25,11 @@ import (
"github.com/google/cadvisor/container"
"github.com/google/cadvisor/container/common"
dockerutil "github.com/google/cadvisor/container/docker/utils"
containerlibcontainer "github.com/google/cadvisor/container/libcontainer"
"github.com/google/cadvisor/devicemapper"
"github.com/google/cadvisor/fs"
info "github.com/google/cadvisor/info/v1"
dockerutil "github.com/google/cadvisor/utils/docker"
"github.com/google/cadvisor/zfs"
dockercontainer "github.com/docker/docker/api/types/container"
@@ -51,7 +51,6 @@ const (
)
type dockerContainerHandler struct {
// machineInfoFactory provides info.MachineInfo
machineInfoFactory info.MachineInfoFactory
@@ -253,11 +252,16 @@ func newDockerContainerHandler(
// split env vars to get metadata map.
for _, exposedEnv := range metadataEnvs {
if exposedEnv == "" {
// if no dockerEnvWhitelist provided, len(metadataEnvs) == 1, metadataEnvs[0] == ""
continue
}
for _, envVar := range ctnr.Config.Env {
if envVar != "" {
splits := strings.SplitN(envVar, "=", 2)
if len(splits) == 2 && splits[0] == exposedEnv {
handler.envs[strings.ToLower(exposedEnv)] = splits[1]
if len(splits) == 2 && strings.HasPrefix(splits[0], exposedEnv) {
handler.envs[strings.ToLower(splits[0])] = splits[1]
}
}
}

View File

@@ -3,8 +3,8 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["docker.go"],
importmap = "k8s.io/kubernetes/vendor/github.com/google/cadvisor/utils/docker",
importpath = "github.com/google/cadvisor/utils/docker",
importmap = "k8s.io/kubernetes/vendor/github.com/google/cadvisor/container/docker/utils",
importpath = "github.com/google/cadvisor/container/docker/utils",
visibility = ["//visibility:public"],
deps = ["//vendor/github.com/docker/docker/api/types:go_default_library"],
)

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
package docker
package utils
import (
"fmt"

View File

@@ -43,21 +43,42 @@ type ContainerHandlerFactory interface {
type MetricKind string
const (
CpuUsageMetrics MetricKind = "cpu"
ProcessSchedulerMetrics MetricKind = "sched"
PerCpuUsageMetrics MetricKind = "percpu"
MemoryUsageMetrics MetricKind = "memory"
CpuLoadMetrics MetricKind = "cpuLoad"
DiskIOMetrics MetricKind = "diskIO"
DiskUsageMetrics MetricKind = "disk"
NetworkUsageMetrics MetricKind = "network"
NetworkTcpUsageMetrics MetricKind = "tcp"
NetworkUdpUsageMetrics MetricKind = "udp"
AcceleratorUsageMetrics MetricKind = "accelerator"
AppMetrics MetricKind = "app"
ProcessMetrics MetricKind = "process"
CpuUsageMetrics MetricKind = "cpu"
ProcessSchedulerMetrics MetricKind = "sched"
PerCpuUsageMetrics MetricKind = "percpu"
MemoryUsageMetrics MetricKind = "memory"
CpuLoadMetrics MetricKind = "cpuLoad"
DiskIOMetrics MetricKind = "diskIO"
DiskUsageMetrics MetricKind = "disk"
NetworkUsageMetrics MetricKind = "network"
NetworkTcpUsageMetrics MetricKind = "tcp"
NetworkAdvancedTcpUsageMetrics MetricKind = "advtcp"
NetworkUdpUsageMetrics MetricKind = "udp"
AcceleratorUsageMetrics MetricKind = "accelerator"
AppMetrics MetricKind = "app"
ProcessMetrics MetricKind = "process"
HugetlbUsageMetrics MetricKind = "hugetlb"
)
// AllMetrics represents all kinds of metrics that cAdvisor supported.
var AllMetrics = MetricSet{
CpuUsageMetrics: struct{}{},
ProcessSchedulerMetrics: struct{}{},
PerCpuUsageMetrics: struct{}{},
MemoryUsageMetrics: struct{}{},
CpuLoadMetrics: struct{}{},
DiskIOMetrics: struct{}{},
AcceleratorUsageMetrics: struct{}{},
DiskUsageMetrics: struct{}{},
NetworkUsageMetrics: struct{}{},
NetworkTcpUsageMetrics: struct{}{},
NetworkAdvancedTcpUsageMetrics: struct{}{},
NetworkUdpUsageMetrics: struct{}{},
ProcessMetrics: struct{}{},
AppMetrics: struct{}{},
HugetlbUsageMetrics: struct{}{},
}
func (mk MetricKind) String() string {
return string(mk)
}
@@ -73,6 +94,16 @@ func (ms MetricSet) Add(mk MetricKind) {
ms[mk] = struct{}{}
}
func (ms MetricSet) Difference(ms1 MetricSet) MetricSet {
result := MetricSet{}
for kind := range ms {
if !ms1.Has(kind) {
result.Add(kind)
}
}
return result
}
// All registered auth provider plugins.
var pluginsLock sync.Mutex
var plugins = make(map[string]Plugin)

View File

@@ -16,11 +16,13 @@ package libcontainer
import (
"bufio"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"regexp"
"strconv"
"strings"
"time"
@@ -44,6 +46,8 @@ type Handler struct {
pidMetricsCache map[int]*info.CpuSchedstat
}
var whitelistedUlimits = [...]string{"max_open_files"}
func NewHandler(cgroupManager cgroups.Manager, rootFs string, pid int, includedMetrics container.MetricSet) *Handler {
return &Handler{
cgroupManager: cgroupManager,
@@ -103,6 +107,15 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
} else {
stats.Network.Tcp6 = t6
}
}
if h.includedMetrics.Has(container.NetworkAdvancedTcpUsageMetrics) {
ta, err := advancedTcpStatsFromProc(h.rootFs, h.pid, "net/netstat", "net/snmp")
if err != nil {
klog.V(4).Infof("Unable to get advanced tcp stats from pid %d: %v", h.pid, err)
} else {
stats.Network.TcpAdvanced = ta
}
}
if h.includedMetrics.Has(container.NetworkUdpUsageMetrics) {
u, err := udpStatsFromProc(h.rootFs, h.pid, "net/udp")
@@ -125,7 +138,7 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
if !ok {
klog.V(4).Infof("Could not find cgroups CPU for container %d", h.pid)
} else {
stats.Processes, err = processStatsFromProcs(h.rootFs, path)
stats.Processes, err = processStatsFromProcs(h.rootFs, path, h.pid)
if err != nil {
klog.V(4).Infof("Unable to get Process Stats: %v", err)
}
@@ -143,7 +156,76 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
return stats, nil
}
func processStatsFromProcs(rootFs string, cgroupPath string) (info.ProcessStats, error) {
func parseUlimit(value string) (int64, error) {
num, err := strconv.ParseInt(value, 10, 64)
if err != nil {
if strings.EqualFold(value, "unlimited") {
// -1 implies unlimited except for priority and nice; man limits.conf
num = -1
} else {
// Value is not a number or "unlimited"; return an error
return 0, fmt.Errorf("unable to parse limit: %s", value)
}
}
return num, nil
}
func isUlimitWhitelisted(name string) bool {
for _, whitelist := range whitelistedUlimits {
if name == whitelist {
return true
}
}
return false
}
func processLimitsFile(fileData string) []info.UlimitSpec {
limits := strings.Split(fileData, "\n")
ulimits := make([]info.UlimitSpec, 0, len(limits))
for _, lim := range limits {
// Skip any headers/footers
if strings.HasPrefix(lim, "Max") {
// Line format: Max open files 16384 16384 files
fields := regexp.MustCompile("[\\s]{2,}").Split(lim, -1)
name := strings.Replace(strings.ToLower(strings.TrimSpace(fields[0])), " ", "_", -1)
found := isUlimitWhitelisted(name)
if !found {
continue
}
soft := strings.TrimSpace(fields[1])
soft_num, soft_err := parseUlimit(soft)
hard := strings.TrimSpace(fields[2])
hard_num, hard_err := parseUlimit(hard)
// Omit metric if there were any parsing errors
if soft_err == nil && hard_err == nil {
ulimitSpec := info.UlimitSpec{
Name: name,
SoftLimit: int64(soft_num),
HardLimit: int64(hard_num),
}
ulimits = append(ulimits, ulimitSpec)
}
}
}
return ulimits
}
func processRootProcUlimits(rootFs string, rootPid int) []info.UlimitSpec {
filePath := path.Join(rootFs, "/proc", strconv.Itoa(rootPid), "limits")
out, err := ioutil.ReadFile(filePath)
if err != nil {
klog.V(4).Infof("error while listing directory %q to read ulimits: %v", filePath, err)
return []info.UlimitSpec{}
}
return processLimitsFile(string(out))
}
func processStatsFromProcs(rootFs string, cgroupPath string, rootPid int) (info.ProcessStats, error) {
var fdCount, socketCount uint64
filePath := path.Join(cgroupPath, "cgroup.procs")
out, err := ioutil.ReadFile(filePath)
@@ -180,11 +262,13 @@ func processStatsFromProcs(rootFs string, cgroupPath string) (info.ProcessStats,
}
}
}
ulimits := processRootProcUlimits(rootFs, rootPid)
processStats := info.ProcessStats{
ProcessCount: uint64(len(pids)),
FdCount: fdCount,
SocketCount: socketCount,
Ulimits: ulimits,
}
return processStats, nil
@@ -334,6 +418,80 @@ func tcpStatsFromProc(rootFs string, pid int, file string) (info.TcpStat, error)
return tcpStats, nil
}
func advancedTcpStatsFromProc(rootFs string, pid int, file1, file2 string) (info.TcpAdvancedStat, error) {
var advancedStats info.TcpAdvancedStat
var err error
netstatFile := path.Join(rootFs, "proc", strconv.Itoa(pid), file1)
err = scanAdvancedTcpStats(&advancedStats, netstatFile)
if err != nil {
return advancedStats, err
}
snmpFile := path.Join(rootFs, "proc", strconv.Itoa(pid), file2)
err = scanAdvancedTcpStats(&advancedStats, snmpFile)
if err != nil {
return advancedStats, err
}
return advancedStats, nil
}
func scanAdvancedTcpStats(advancedStats *info.TcpAdvancedStat, advancedTcpStatsFile string) error {
data, err := ioutil.ReadFile(advancedTcpStatsFile)
if err != nil {
return fmt.Errorf("failure opening %s: %v", advancedTcpStatsFile, err)
}
reader := strings.NewReader(string(data))
scanner := bufio.NewScanner(reader)
scanner.Split(bufio.ScanLines)
advancedTcpStats := make(map[string]interface{})
for scanner.Scan() {
nameParts := strings.Split(scanner.Text(), " ")
scanner.Scan()
valueParts := strings.Split(scanner.Text(), " ")
// Remove trailing :. and ignore non-tcp
protocol := nameParts[0][:len(nameParts[0])-1]
if protocol != "TcpExt" && protocol != "Tcp" {
continue
}
if len(nameParts) != len(valueParts) {
return fmt.Errorf("mismatch field count mismatch in %s: %s",
advancedTcpStatsFile, protocol)
}
for i := 1; i < len(nameParts); i++ {
if strings.Contains(valueParts[i], "-") {
vInt64, err := strconv.ParseInt(valueParts[i], 10, 64)
if err != nil {
return fmt.Errorf("decode value: %s to int64 error: %s", valueParts[i], err)
}
advancedTcpStats[nameParts[i]] = vInt64
} else {
vUint64, err := strconv.ParseUint(valueParts[i], 10, 64)
if err != nil {
return fmt.Errorf("decode value: %s to uint64 error: %s", valueParts[i], err)
}
advancedTcpStats[nameParts[i]] = vUint64
}
}
}
b, err := json.Marshal(advancedTcpStats)
if err != nil {
return err
}
err = json.Unmarshal(b, advancedStats)
if err != nil {
return err
}
return scanner.Err()
}
func scanTcpStats(tcpStatsFile string) (info.TcpStat, error) {
var stats info.TcpStat
@@ -581,6 +739,17 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.WorkingSet = workingSet
}
func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Hugetlb = make(map[string]info.HugetlbStats)
for k, v := range s.HugetlbStats {
ret.Hugetlb[k] = info.HugetlbStats{
Usage: v.Usage,
MaxUsage: v.MaxUsage,
Failcnt: v.Failcnt,
}
}
}
func setNetworkStats(libcontainerStats *libcontainer.Stats, ret *info.ContainerStats) {
ret.Network.Interfaces = make([]info.InterfaceStats, len(libcontainerStats.Interfaces))
for i := range libcontainerStats.Interfaces {
@@ -623,6 +792,9 @@ func newContainerStats(libcontainerStats *libcontainer.Stats, includedMetrics co
setDiskIoStats(s, ret)
}
setMemoryStats(s, ret)
if includedMetrics.Has(container.HugetlbUsageMetrics) {
setHugepageStats(s, ret)
}
}
if len(libcontainerStats.Interfaces) > 0 {
setNetworkStats(libcontainerStats, ret)

View File

@@ -107,6 +107,7 @@ var supportedSubsystems map[string]struct{} = map[string]struct{}{
"cpu": {},
"cpuacct": {},
"memory": {},
"hugetlb": {},
"pids": {},
"cpuset": {},
"blkio": {},

View File

@@ -42,7 +42,8 @@ func (f *systemdFactory) CanHandleAndAccept(name string) (bool, bool, error) {
if strings.HasSuffix(name, ".mount") {
return true, false, nil
}
return false, false, fmt.Errorf("%s not handled by systemd handler", name)
klog.V(5).Infof("%s not handled by systemd handler", name)
return false, false, nil
}
func (f *systemdFactory) DebugInfo() map[string][]string {

View File

@@ -11,20 +11,18 @@ go_library(
visibility = ["//visibility:public"],
deps = select({
"@io_bazel_rules_go//go/platform:android": [
"//vendor/github.com/docker/docker/pkg/mount:go_default_library",
"//vendor/github.com/google/cadvisor/devicemapper:go_default_library",
"//vendor/github.com/google/cadvisor/utils:go_default_library",
"//vendor/github.com/google/cadvisor/utils/docker:go_default_library",
"//vendor/github.com/mistifyio/go-zfs:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
"//vendor/k8s.io/utils/mount:go_default_library",
],
"@io_bazel_rules_go//go/platform:linux": [
"//vendor/github.com/docker/docker/pkg/mount:go_default_library",
"//vendor/github.com/google/cadvisor/devicemapper:go_default_library",
"//vendor/github.com/google/cadvisor/utils:go_default_library",
"//vendor/github.com/google/cadvisor/utils/docker:go_default_library",
"//vendor/github.com/mistifyio/go-zfs:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
"//vendor/k8s.io/utils/mount:go_default_library",
],
"//conditions:default": [],
}),

View File

@@ -30,18 +30,20 @@ import (
"strings"
"syscall"
"github.com/docker/docker/pkg/mount"
"github.com/google/cadvisor/devicemapper"
"github.com/google/cadvisor/utils"
dockerutil "github.com/google/cadvisor/utils/docker"
zfs "github.com/mistifyio/go-zfs"
"k8s.io/klog"
"k8s.io/utils/mount"
)
const (
LabelSystemRoot = "root"
LabelDockerImages = "docker-images"
LabelCrioImages = "crio-images"
LabelSystemRoot = "root"
LabelDockerImages = "docker-images"
LabelCrioImages = "crio-images"
DriverStatusPoolName = "Pool Name"
DriverStatusDataLoopFile = "Data loop file"
)
const (
@@ -83,7 +85,7 @@ type RealFsInfo struct {
// Labels are intent-specific tags that are auto-detected.
labels map[string]string
// Map from mountpoint to mount information.
mounts map[string]*mount.Info
mounts map[string]mount.MountInfo
// devicemapper client
dmsetup devicemapper.DmsetupClient
// fsUUIDToDeviceName is a map from the filesystem UUID to its device name.
@@ -91,7 +93,7 @@ type RealFsInfo struct {
}
func NewFsInfo(context Context) (FsInfo, error) {
mounts, err := mount.GetMounts(nil)
mounts, err := mount.ParseMountInfo("/proc/self/mountinfo")
if err != nil {
return nil, err
}
@@ -108,13 +110,13 @@ func NewFsInfo(context Context) (FsInfo, error) {
fsInfo := &RealFsInfo{
partitions: processMounts(mounts, excluded),
labels: make(map[string]string, 0),
mounts: make(map[string]*mount.Info, 0),
mounts: make(map[string]mount.MountInfo, 0),
dmsetup: devicemapper.NewDmsetupClient(),
fsUUIDToDeviceName: fsUUIDToDeviceName,
}
for _, mount := range mounts {
fsInfo.mounts[mount.Mountpoint] = mount
fsInfo.mounts[mount.MountPoint] = mount
}
// need to call this before the log line below printing out the partitions, as this function may
@@ -160,7 +162,7 @@ func getFsUUIDToDeviceNameMap() (map[string]string, error) {
return fsUUIDToDeviceName, nil
}
func processMounts(mounts []*mount.Info, excludedMountpointPrefixes []string) map[string]partition {
func processMounts(mounts []mount.MountInfo, excludedMountpointPrefixes []string) map[string]partition {
partitions := make(map[string]partition, 0)
supportedFsType := map[string]bool{
@@ -173,19 +175,19 @@ func processMounts(mounts []*mount.Info, excludedMountpointPrefixes []string) ma
}
for _, mount := range mounts {
if !strings.HasPrefix(mount.Fstype, "ext") && !supportedFsType[mount.Fstype] {
if !strings.HasPrefix(mount.FsType, "ext") && !supportedFsType[mount.FsType] {
continue
}
// Avoid bind mounts, exclude tmpfs.
if _, ok := partitions[mount.Source]; ok {
if mount.Fstype != "tmpfs" {
if mount.FsType != "tmpfs" {
continue
}
}
hasPrefix := false
for _, prefix := range excludedMountpointPrefixes {
if strings.HasPrefix(mount.Mountpoint, prefix) {
if strings.HasPrefix(mount.MountPoint, prefix) {
hasPrefix = true
break
}
@@ -195,13 +197,13 @@ func processMounts(mounts []*mount.Info, excludedMountpointPrefixes []string) ma
}
// using mountpoint to replace device once fstype it tmpfs
if mount.Fstype == "tmpfs" {
mount.Source = mount.Mountpoint
if mount.FsType == "tmpfs" {
mount.Source = mount.MountPoint
}
// btrfs fix: following workaround fixes wrong btrfs Major and Minor Ids reported in /proc/self/mountinfo.
// instead of using values from /proc/self/mountinfo we use stat to get Ids from btrfs mount point
if mount.Fstype == "btrfs" && mount.Major == 0 && strings.HasPrefix(mount.Source, "/dev/") {
major, minor, err := getBtrfsMajorMinorIds(mount)
if mount.FsType == "btrfs" && mount.Major == 0 && strings.HasPrefix(mount.Source, "/dev/") {
major, minor, err := getBtrfsMajorMinorIds(&mount)
if err != nil {
klog.Warningf("%s", err)
} else {
@@ -211,13 +213,13 @@ func processMounts(mounts []*mount.Info, excludedMountpointPrefixes []string) ma
}
// overlay fix: Making mount source unique for all overlay mounts, using the mount's major and minor ids.
if mount.Fstype == "overlay" {
if mount.FsType == "overlay" {
mount.Source = fmt.Sprintf("%s_%d-%d", mount.Source, mount.Major, mount.Minor)
}
partitions[mount.Source] = partition{
fsType: mount.Fstype,
mountpoint: mount.Mountpoint,
fsType: mount.FsType,
mountpoint: mount.MountPoint,
major: uint(mount.Major),
minor: uint(mount.Minor),
}
@@ -235,7 +237,7 @@ func (self *RealFsInfo) getDockerDeviceMapperInfo(context DockerContext) (string
return "", nil, nil
}
dataLoopFile := context.DriverStatus[dockerutil.DriverStatusDataLoopFile]
dataLoopFile := context.DriverStatus[DriverStatusDataLoopFile]
if len(dataLoopFile) > 0 {
return "", nil, nil
}
@@ -254,12 +256,12 @@ func (self *RealFsInfo) getDockerDeviceMapperInfo(context DockerContext) (string
}
// addSystemRootLabel attempts to determine which device contains the mount for /.
func (self *RealFsInfo) addSystemRootLabel(mounts []*mount.Info) {
func (self *RealFsInfo) addSystemRootLabel(mounts []mount.MountInfo) {
for _, m := range mounts {
if m.Mountpoint == "/" {
if m.MountPoint == "/" {
self.partitions[m.Source] = partition{
fsType: m.Fstype,
mountpoint: m.Mountpoint,
fsType: m.FsType,
mountpoint: m.MountPoint,
major: uint(m.Major),
minor: uint(m.Minor),
}
@@ -270,7 +272,7 @@ func (self *RealFsInfo) addSystemRootLabel(mounts []*mount.Info) {
}
// addDockerImagesLabel attempts to determine which device contains the mount for docker images.
func (self *RealFsInfo) addDockerImagesLabel(context Context, mounts []*mount.Info) {
func (self *RealFsInfo) addDockerImagesLabel(context Context, mounts []mount.MountInfo) {
dockerDev, dockerPartition, err := self.getDockerDeviceMapperInfo(context.Docker)
if err != nil {
klog.Warningf("Could not get Docker devicemapper device: %v", err)
@@ -283,7 +285,7 @@ func (self *RealFsInfo) addDockerImagesLabel(context Context, mounts []*mount.In
}
}
func (self *RealFsInfo) addCrioImagesLabel(context Context, mounts []*mount.Info) {
func (self *RealFsInfo) addCrioImagesLabel(context Context, mounts []mount.MountInfo) {
if context.Crio.Root != "" {
crioPath := context.Crio.Root
crioImagePaths := map[string]struct{}{
@@ -322,19 +324,19 @@ func getDockerImagePaths(context Context) map[string]struct{} {
// This method compares the mountpoints with possible container image mount points. If a match is found,
// the label is added to the partition.
func (self *RealFsInfo) updateContainerImagesPath(label string, mounts []*mount.Info, containerImagePaths map[string]struct{}) {
var useMount *mount.Info
func (self *RealFsInfo) updateContainerImagesPath(label string, mounts []mount.MountInfo, containerImagePaths map[string]struct{}) {
var useMount *mount.MountInfo
for _, m := range mounts {
if _, ok := containerImagePaths[m.Mountpoint]; ok {
if useMount == nil || (len(useMount.Mountpoint) < len(m.Mountpoint)) {
useMount = m
if _, ok := containerImagePaths[m.MountPoint]; ok {
if useMount == nil || (len(useMount.MountPoint) < len(m.MountPoint)) {
useMount = &m
}
}
}
if useMount != nil {
self.partitions[useMount.Source] = partition{
fsType: useMount.Fstype,
mountpoint: useMount.Mountpoint,
fsType: useMount.FsType,
mountpoint: useMount.MountPoint,
major: uint(useMount.Major),
minor: uint(useMount.Minor),
}
@@ -510,8 +512,9 @@ func (self *RealFsInfo) GetDirFsDevice(dir string) (*DeviceInfo, error) {
return nil, fmt.Errorf("stat failed on %s with error: %s", dir, err)
}
major := major(buf.Dev)
minor := minor(buf.Dev)
// The type Dev in Stat_t is 32bit on mips.
major := major(uint64(buf.Dev)) // nolint: unconvert
minor := minor(uint64(buf.Dev)) // nolint: unconvert
for device, partition := range self.partitions {
if partition.major == major && partition.minor == minor {
return &DeviceInfo{device, major, minor}, nil
@@ -534,8 +537,8 @@ func (self *RealFsInfo) GetDirFsDevice(dir string) (*DeviceInfo, error) {
mount, found = self.mounts[dir]
}
if found && mount.Fstype == "btrfs" && mount.Major == 0 && strings.HasPrefix(mount.Source, "/dev/") {
major, minor, err := getBtrfsMajorMinorIds(mount)
if found && mount.FsType == "btrfs" && mount.Major == 0 && strings.HasPrefix(mount.Source, "/dev/") {
major, minor, err := getBtrfsMajorMinorIds(&mount)
if err != nil {
klog.Warningf("%s", err)
} else {
@@ -605,7 +608,7 @@ func GetDirUsage(dir string) (UsageInfo, error) {
return nil
})
return usage, nil
return usage, err
}
func (self *RealFsInfo) GetDirUsage(dir string) (UsageInfo, error) {
@@ -630,7 +633,7 @@ func getVfsStats(path string) (total uint64, free uint64, avail uint64, inodes u
// Devicemapper thin provisioning is detailed at
// https://www.kernel.org/doc/Documentation/device-mapper/thin-provisioning.txt
func dockerDMDevice(driverStatus map[string]string, dmsetup devicemapper.DmsetupClient) (string, uint, uint, uint, error) {
poolName, ok := driverStatus[dockerutil.DriverStatusPoolName]
poolName, ok := driverStatus[DriverStatusPoolName]
if !ok || len(poolName) == 0 {
return "", 0, 0, 0, fmt.Errorf("Could not get dm pool name")
}
@@ -733,7 +736,7 @@ func (b *byteCounter) Write(p []byte) (int, error) {
}
// Get major and minor Ids for a mount point using btrfs as filesystem.
func getBtrfsMajorMinorIds(mount *mount.Info) (int, int, error) {
func getBtrfsMajorMinorIds(mount *mount.MountInfo) (int, int, error) {
// btrfs fix: following workaround fixes wrong btrfs Major and Minor Ids reported in /proc/self/mountinfo.
// instead of using values from /proc/self/mountinfo we use stat to get Ids from btrfs mount point
@@ -746,16 +749,17 @@ func getBtrfsMajorMinorIds(mount *mount.Info) (int, int, error) {
klog.V(4).Infof("btrfs mount %#v", mount)
if buf.Mode&syscall.S_IFMT == syscall.S_IFBLK {
err := syscall.Stat(mount.Mountpoint, buf)
err := syscall.Stat(mount.MountPoint, buf)
if err != nil {
err = fmt.Errorf("stat failed on %s with error: %s", mount.Mountpoint, err)
err = fmt.Errorf("stat failed on %s with error: %s", mount.MountPoint, err)
return 0, 0, err
}
klog.V(4).Infof("btrfs dev major:minor %d:%d\n", int(major(buf.Dev)), int(minor(buf.Dev)))
klog.V(4).Infof("btrfs rdev major:minor %d:%d\n", int(major(buf.Rdev)), int(minor(buf.Rdev)))
// The type Dev and Rdev in Stat_t are 32bit on mips.
klog.V(4).Infof("btrfs dev major:minor %d:%d\n", int(major(uint64(buf.Dev))), int(minor(uint64(buf.Dev)))) // nolint: unconvert
klog.V(4).Infof("btrfs rdev major:minor %d:%d\n", int(major(uint64(buf.Rdev))), int(minor(uint64(buf.Rdev)))) // nolint: unconvert
return int(major(buf.Dev)), int(minor(buf.Dev)), nil
return int(major(uint64(buf.Dev))), int(minor(uint64(buf.Dev))), nil // nolint: unconvert
} else {
return 0, 0, fmt.Errorf("%s is not a block device", mount.Source)
}

View File

@@ -60,6 +60,8 @@ type ContainerSpec struct {
HasMemory bool `json:"has_memory"`
Memory MemorySpec `json:"memory,omitempty"`
HasHugetlb bool `json:"has_hugetlb"`
HasNetwork bool `json:"has_network"`
HasProcesses bool `json:"has_processes"`
@@ -342,6 +344,15 @@ type DiskIoStats struct {
IoTime []PerDiskStats `json:"io_time,omitempty"`
}
type HugetlbStats struct {
// current res_counter usage for hugetlb
Usage uint64 `json:"usage,omitempty"`
// maximum usage ever recorded.
MaxUsage uint64 `json:"max_usage,omitempty"`
// number of times hugetlb usage allocation failure.
Failcnt uint64 `json:"failcnt"`
}
type MemoryStats struct {
// Current memory usage, this includes all memory regardless of when it was
// accessed.
@@ -416,6 +427,8 @@ type NetworkStats struct {
Udp UdpStat `json:"udp"`
// UDP6 connection stats
Udp6 UdpStat `json:"udp6"`
// TCP advanced stats
TcpAdvanced TcpAdvancedStat `json:"tcp_advanced"`
}
type TcpStat struct {
@@ -443,6 +456,245 @@ type TcpStat struct {
Closing uint64
}
type TcpAdvancedStat struct {
// The algorithm used to determine the timeout value used for
// retransmitting unacknowledged octets, ref: RFC2698, default 1
RtoAlgorithm uint64
// The minimum value permitted by a TCP implementation for the
// retransmission timeout, measured in milliseconds, default 200ms
RtoMin uint64
// The maximum value permitted by a TCP implementation for the
// retransmission timeout, measured in milliseconds, default 120s
RtoMax uint64
// The limit on the total number of TCP connections the entity
// can support., default -1, i.e. infinity
MaxConn int64
// The number of times TCP connections have made a direct
// transition to the SYN-SENT state from the CLOSED state.
ActiveOpens uint64
// The number of times TCP connections have made a direct
// transition to the SYN-RCVD state from the LISTEN state.
PassiveOpens uint64
// The number of times TCP connections have made a direct
// transition to the CLOSED state from either the SYN-SENT
// state or the SYN-RCVD state, plus the number of times TCP
// connections have made a direct transition to the LISTEN
// state from the SYN-RCVD state.
AttemptFails uint64
// The number of times TCP connections have made a direct
// transition to the CLOSED state from either the ESTABLISHED
// state or the CLOSE-WAIT state.
EstabResets uint64
// The number of TCP connections for which the current state
// is either ESTABLISHED or CLOSE- WAIT.
CurrEstab uint64
// The total number of segments received, including those
// received in error.
InSegs uint64
// The total number of segments sent, including those on
// current connections but excluding those containing only
// retransmitted octets.
OutSegs uint64
// The total number of segments retransmitted - that is, the
// number of TCP segments transmitted containing one or more
// previously transmitted octets.
RetransSegs uint64
// The total number of segments received in error (e.g., bad
// TCP checksums).
InErrs uint64
// The number of TCP segments sent containing the RST flag.
OutRsts uint64
// The number of IP Packets with checksum errors
InCsumErrors uint64
// The number of resets received for embryonic SYN_RECV sockets
EmbryonicRsts uint64
// The number of SYN cookies sent
SyncookiesSent uint64
// The number of SYN cookies received
SyncookiesRecv uint64
// The number of invalid SYN cookies received
SyncookiesFailed uint64
// The number of packets pruned from receive queue because of socket buffer overrun
PruneCalled uint64
// The number of packets pruned from receive queue
RcvPruned uint64
// The number of packets dropped from out-of-order queue because of socket buffer overrun
OfoPruned uint64
// The number of ICMP packets dropped because they were out-of-window
OutOfWindowIcmps uint64
// The number of ICMP packets dropped because socket was locked
LockDroppedIcmps uint64
// The number of TCP sockets finished time wait in fast timer
TW uint64
// The number of time wait sockets recycled by time stamp
TWRecycled uint64
// The number of TCP sockets finished time wait in slow timer
TWKilled uint64
// counter, if no more mem for TIME-WAIT struct, +1
TCPTimeWaitOverflow uint64
// The number of RTO timer first timeout times
TCPTimeouts uint64
// The number of fake timeouts detected by F-RTO
TCPSpuriousRTOs uint64
// The number of send Tail Loss Probe (TLP) times by Probe Timeout(PTO)
TCPLossProbes uint64
// The number of recovery times by TLP
TCPLossProbeRecovery uint64
// The number of RTO failed times when in Recovery state, and remote end has no sack
TCPRenoRecoveryFail uint64
// The number of RTO failed times when in Recovery state, and remote end has sack
TCPSackRecoveryFail uint64
// The number of RTO failed times when in TCP_CA_Disorder state, and remote end has no sack
TCPRenoFailures uint64
// The number of RTO failed times when in TCP_CA_Disorder state, and remote end has sack
TCPSackFailures uint64
// The number of RTO failed times when in TCP_CA_Loss state,
TCPLossFailures uint64
// The number of delayed acks sent
DelayedACKs uint64
// The number of delayed acks further delayed because of locked socket
DelayedACKLocked uint64
// The number of quick ack mode was activated times
DelayedACKLost uint64
// The number of times the listen queue of a socket overflowed
ListenOverflows uint64
// The number of SYNs to LISTEN sockets dropped
ListenDrops uint64
// The number of packet headers predicted
TCPHPHits uint64
// The number of acknowledgments not containing data payload received
TCPPureAcks uint64
// The number of predicted acknowledgments
TCPHPAcks uint64
// The number of times recovered from packet loss due to fast retransmit
TCPRenoRecovery uint64
// The number of SACK retransmits failed
TCPSackRecovery uint64
// The number of bad SACK blocks received
TCPSACKReneging uint64
// The number of detected reordering times using FACK
TCPFACKReorder uint64
// The number of detected reordering times using SACK
TCPSACKReorder uint64
// The number of detected reordering times using Reno
TCPRenoReorder uint64
// The number of detected reordering times using time stamp
TCPTSReorder uint64
// The number of congestion windows fully recovered without slow start
TCPFullUndo uint64
// The number of congestion windows partially recovered using Hoe heuristic
TCPPartialUndo uint64
// The number of congestion windows recovered without slow start by DSACK
TCPDSACKUndo uint64
// The number of congestion windows recovered without slow start after partial ack
TCPLossUndo uint64
// The number of fast retransmits
TCPFastRetrans uint64
// The number of retransmits in slow start
TCPSlowStartRetrans uint64
// The number of retransmits lost
TCPLostRetransmit uint64
// The number of retransmits failed, including FastRetrans, SlowStartRetrans
TCPRetransFail uint64
// he number of packets collapsed in receive queue due to low socket buffer
TCPRcvCollapsed uint64
// The number of DSACKs sent for old packets
TCPDSACKOldSent uint64
// The number of DSACKs sent for out of order packets
TCPDSACKOfoSent uint64
// The number of DSACKs received
TCPDSACKRecv uint64
// The number of DSACKs for out of order packets received
TCPDSACKOfoRecv uint64
// The number of connections reset due to unexpected data
TCPAbortOnData uint64
// The number of connections reset due to early user close
TCPAbortOnClose uint64
// The number of connections aborted due to memory pressure
TCPAbortOnMemory uint64
// The number of connections aborted due to timeout
TCPAbortOnTimeout uint64
// The number of connections aborted after user close in linger timeout
TCPAbortOnLinger uint64
// The number of times unable to send RST due to no memory
TCPAbortFailed uint64
// The number of TCP ran low on memory times
TCPMemoryPressures uint64
// The number of TCP cumulative duration of
// memory pressure events, by ms
TCPMemoryPressuresChrono uint64
// The number of SACKs discard
TCPSACKDiscard uint64
// The number of DSACKs ignore old
TCPDSACKIgnoredOld uint64
// The number of DSACKs ignore no undo
TCPDSACKIgnoredNoUndo uint64
// The number of MD5 not found
TCPMD5NotFound uint64
// The number of MD5 unexpected
TCPMD5Unexpected uint64
// The number of MD5 failed
TCPMD5Failure uint64
// The number of Sack shifted
TCPSackShifted uint64
// The number of Sack merged
TCPSackMerged uint64
// The number of Sack shift fall back
TCPSackShiftFallback uint64
// The number of Backlog drop
TCPBacklogDrop uint64
// The number of PFmemalloc drop
PFMemallocDrop uint64
// The number of memalloc drop
TCPMinTTLDrop uint64
// The number of DeferAccept drop
TCPDeferAcceptDrop uint64
// The number of IP reverse path filter
IPReversePathFilter uint64
// The number of request full do cookies
TCPReqQFullDoCookies uint64
// The number of request full drop
TCPReqQFullDrop uint64
// number of successful outbound TFO connections
TCPFastOpenActive uint64
// number of SYN-ACK packets received that did not acknowledge data
// sent in the SYN packet and caused a retransmissions without SYN data.
TCPFastOpenActiveFail uint64
// number of successful inbound TFO connections
TCPFastOpenPassive uint64
// number of inbound SYN packets with TFO cookie that was invalid
TCPFastOpenPassiveFail uint64
// number of inbound SYN packets that will have TFO disabled because
// the socket has exceeded the max queue length
TCPFastOpenListenOverflow uint64
// number of inbound SYN packets requesting TFO with TFO set but no cookie
TCPFastOpenCookieReqd uint64
// number of SYN and SYN/ACK retransmits to break down retransmissions
// into SYN, fast-retransmits, timeout retransmits, etc.
TCPSynRetrans uint64
// number of outgoing packets with original data
// (excluding retransmission but including data-in-SYN).
TCPOrigDataSent uint64
// The number of active connections rejected because of time stamp
PAWSActive uint64
// The number of packetes rejected in established connections because of timestamp
PAWSEstab uint64
}
type UdpStat struct {
// Count of UDP sockets in state "Listen"
Listen uint64
@@ -564,6 +816,12 @@ type AcceleratorStats struct {
DutyCycle uint64 `json:"duty_cycle"`
}
type UlimitSpec struct {
Name string `json:"name"`
SoftLimit int64 `json:"soft_limit"`
HardLimit int64 `json:"hard_limit"`
}
type ProcessStats struct {
// Number of processes
ProcessCount uint64 `json:"process_count"`
@@ -579,16 +837,19 @@ type ProcessStats struct {
// Maxium number of threads allowed in container
ThreadsMax uint64 `json:"threads_max,omitempty"`
// Ulimits for the top-level container process
Ulimits []UlimitSpec `json:"ulimits,omitempty"`
}
type ContainerStats struct {
// The time of this stat point.
Timestamp time.Time `json:"timestamp"`
Cpu CpuStats `json:"cpu,omitempty"`
DiskIo DiskIoStats `json:"diskio,omitempty"`
Memory MemoryStats `json:"memory,omitempty"`
Network NetworkStats `json:"network,omitempty"`
Timestamp time.Time `json:"timestamp"`
Cpu CpuStats `json:"cpu,omitempty"`
DiskIo DiskIoStats `json:"diskio,omitempty"`
Memory MemoryStats `json:"memory,omitempty"`
Hugetlb map[string]HugetlbStats `json:"hugetlb,omitempty"`
Network NetworkStats `json:"network,omitempty"`
// Filesystem statistics
Filesystem []FsStats `json:"filesystem,omitempty"`
@@ -640,6 +901,9 @@ func (a *ContainerStats) StatsEq(b *ContainerStats) bool {
if !reflect.DeepEqual(a.Memory, b.Memory) {
return false
}
if !reflect.DeepEqual(a.Hugetlb, b.Hugetlb) {
return false
}
if !reflect.DeepEqual(a.DiskIo, b.DiskIo) {
return false
}

View File

@@ -161,12 +161,23 @@ type MachineInfo struct {
// The number of cores in this machine.
NumCores int `json:"num_cores"`
// The number of physical cores in this machine.
NumPhysicalCores int `json:"num_physical_cores"`
// The number of cpu sockets in this machine.
NumSockets int `json:"num_sockets"`
// Maximum clock speed for the cores, in KHz.
CpuFrequency uint64 `json:"cpu_frequency_khz"`
// The amount of memory (in bytes) in this machine
MemoryCapacity uint64 `json:"memory_capacity"`
// Memory capacity and number of DIMMs by memory type
MemoryByType map[string]*MemoryInfo `json:"memory_by_type"`
NVMInfo NVMInfo `json:"nvm"`
// HugePages on this machine.
HugePages []HugePagesInfo `json:"hugepages"`
@@ -202,6 +213,25 @@ type MachineInfo struct {
InstanceID InstanceID `json:"instance_id"`
}
type MemoryInfo struct {
// The amount of memory (in bytes).
Capacity uint64 `json:"capacity"`
// Number of memory DIMMs.
DimmCount uint `json:"dimm_count"`
}
type NVMInfo struct {
// The total NVM capacity in bytes for memory mode.
MemoryModeCapacity uint64 `json:"memory_mode_capacity"`
//The total NVM capacity in bytes for app direct mode.
AppDirectModeCapacity uint64 `json:"app direct_mode_capacity"`
// Average power budget in watts for NVM devices configured in BIOS.
AvgPowerBudget uint `json:"avg_power_budget"`
}
type VersionInfo struct {
// Kernel version.
KernelVersion string `json:"kernel_version"`

View File

@@ -68,7 +68,8 @@ type MetricValBasic struct {
// An exported metric.
type MetricVal struct {
// Label associated with a metric
Label string `json:"label,omitempty"`
Label string `json:"label,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
// Time at which the metric was queried
Timestamp time.Time `json:"timestamp"`

View File

@@ -87,6 +87,8 @@ type ContainerSpec struct {
HasMemory bool `json:"has_memory"`
Memory MemorySpec `json:"memory,omitempty"`
HasHugetlb bool `json:"has_hugetlb"`
HasCustomMetrics bool `json:"has_custom_metrics"`
CustomMetrics []v1.MetricSpec `json:"custom_metrics,omitempty"`
@@ -117,6 +119,8 @@ type DeprecatedContainerStats struct {
// Memory statistics
HasMemory bool `json:"has_memory"`
Memory v1.MemoryStats `json:"memory,omitempty"`
// Hugepage statistics
HasHugetlb bool `json:"has_hugetlb"`
// Network statistics
HasNetwork bool `json:"has_network"`
Network NetworkStats `json:"network,omitempty"`
@@ -146,6 +150,8 @@ type ContainerStats struct {
DiskIo *v1.DiskIoStats `json:"diskio,omitempty"`
// Memory statistics
Memory *v1.MemoryStats `json:"memory,omitempty"`
// Hugepage statistics
Hugetlb *map[string]v1.HugetlbStats `json:"hugetlb,omitempty"`
// Network statistics
Network *NetworkStats `json:"network,omitempty"`
// Processes statistics
@@ -290,6 +296,8 @@ type NetworkStats struct {
Udp v1.UdpStat `json:"udp"`
// UDP6 connection stats
Udp6 v1.UdpStat `json:"udp6"`
// TCP advanced stats
TcpAdvanced v1.TcpAdvancedStat `json:"tcp_advanced"`
}
// Instantaneous CPU stats

View File

@@ -116,6 +116,9 @@ func ContainerStatsFromV1(containerName string, spec *v1.ContainerSpec, stats []
if spec.HasMemory {
stat.Memory = &val.Memory
}
if spec.HasHugetlb {
stat.Hugetlb = &val.Hugetlb
}
if spec.HasNetwork {
// TODO: Handle TcpStats
stat.Network = &NetworkStats{
@@ -259,6 +262,7 @@ func ContainerSpecFromV1(specV1 *v1.ContainerSpec, aliases []string, namespace s
CreationTime: specV1.CreationTime,
HasCpu: specV1.HasCpu,
HasMemory: specV1.HasMemory,
HasHugetlb: specV1.HasHugetlb,
HasFilesystem: specV1.HasFilesystem,
HasNetwork: specV1.HasNetwork,
HasProcesses: specV1.HasProcesses,

View File

@@ -5,12 +5,15 @@ go_library(
srcs = [
"info.go",
"machine.go",
"machine_no_libipmctl.go",
"operatingsystem_unix.go",
"operatingsystem_windows.go",
],
cgo = True,
importmap = "k8s.io/kubernetes/vendor/github.com/google/cadvisor/machine",
importpath = "github.com/google/cadvisor/machine",
visibility = ["//visibility:public"],
deps = [
"//vendor/github.com/docker/docker/pkg/parsers/operatingsystem:go_default_library",
"//vendor/github.com/google/cadvisor/fs:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/google/cadvisor/utils:go_default_library",
@@ -19,7 +22,12 @@ go_library(
"//vendor/github.com/google/cadvisor/utils/sysinfo:go_default_library",
"//vendor/golang.org/x/sys/unix:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
] + select({
"@io_bazel_rules_go//go/platform:windows": [
"//vendor/golang.org/x/sys/windows/registry:go_default_library",
],
"//conditions:default": [],
}),
)
filegroup(

View File

@@ -21,7 +21,6 @@ import (
"path/filepath"
"strings"
"github.com/docker/docker/pkg/parsers/operatingsystem"
"github.com/google/cadvisor/fs"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/utils/cloudinfo"
@@ -34,6 +33,7 @@ import (
)
const hugepagesDirectory = "/sys/kernel/mm/hugepages/"
const memoryControllerPath = "/sys/devices/system/edac/mc/"
var machineIdFilePath = flag.String("machine_id_file", "/etc/machine-id,/var/lib/dbus/machine-id", "Comma-separated list of files to check for machine-id. Use the first one that exists.")
var bootIdFilePath = flag.String("boot_id_file", "/proc/sys/kernel/random/boot_id", "Comma-separated list of files to check for boot-id. Use the first one that exists.")
@@ -72,7 +72,17 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
return nil, err
}
hugePagesInfo, err := GetHugePagesInfo(hugepagesDirectory)
memoryByType, err := GetMachineMemoryByType(memoryControllerPath)
if err != nil {
return nil, err
}
nvmInfo, err := GetNVMInfo()
if err != nil {
return nil, err
}
hugePagesInfo, err := sysinfo.GetHugePagesInfo(sysFs, hugepagesDirectory)
if err != nil {
return nil, err
}
@@ -92,7 +102,7 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
klog.Errorf("Failed to get network devices: %v", err)
}
topology, numCores, err := GetTopology(sysFs, string(cpuinfo))
topology, numCores, err := GetTopology(sysFs)
if err != nil {
klog.Errorf("Failed to get topology information: %v", err)
}
@@ -108,19 +118,23 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
instanceID := realCloudInfo.GetInstanceID()
machineInfo := &info.MachineInfo{
NumCores: numCores,
CpuFrequency: clockSpeed,
MemoryCapacity: memoryCapacity,
HugePages: hugePagesInfo,
DiskMap: diskMap,
NetworkDevices: netDevices,
Topology: topology,
MachineID: getInfoFromFiles(filepath.Join(rootFs, *machineIdFilePath)),
SystemUUID: systemUUID,
BootID: getInfoFromFiles(filepath.Join(rootFs, *bootIdFilePath)),
CloudProvider: cloudProvider,
InstanceType: instanceType,
InstanceID: instanceID,
NumCores: numCores,
NumPhysicalCores: GetPhysicalCores(cpuinfo),
NumSockets: GetSockets(cpuinfo),
CpuFrequency: clockSpeed,
MemoryCapacity: memoryCapacity,
MemoryByType: memoryByType,
NVMInfo: nvmInfo,
HugePages: hugePagesInfo,
DiskMap: diskMap,
NetworkDevices: netDevices,
Topology: topology,
MachineID: getInfoFromFiles(filepath.Join(rootFs, *machineIdFilePath)),
SystemUUID: systemUUID,
BootID: getInfoFromFiles(filepath.Join(rootFs, *bootIdFilePath)),
CloudProvider: cloudProvider,
InstanceType: instanceType,
InstanceID: instanceID,
}
for i := range filesystems {
@@ -136,7 +150,7 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
}
func ContainerOsVersion() string {
os, err := operatingsystem.GetOperatingSystem()
os, err := getOperatingSystem()
if err != nil {
os = "Unknown"
}

View File

@@ -16,9 +16,10 @@
package machine
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path"
"path/filepath"
"regexp"
"strconv"
@@ -39,23 +40,60 @@ import (
var (
cpuRegExp = regexp.MustCompile(`^processor\s*:\s*([0-9]+)$`)
coreRegExp = regexp.MustCompile(`^core id\s*:\s*([0-9]+)$`)
nodeRegExp = regexp.MustCompile(`^physical id\s*:\s*([0-9]+)$`)
coreRegExp = regexp.MustCompile(`(?m)^core id\s*:\s*([0-9]+)$`)
nodeRegExp = regexp.MustCompile(`(?m)^physical id\s*:\s*([0-9]+)$`)
nodeBusRegExp = regexp.MustCompile(`^node([0-9]+)$`)
// Power systems have a different format so cater for both
cpuClockSpeedMHz = regexp.MustCompile(`(?:cpu MHz|clock)\s*:\s*([0-9]+\.[0-9]+)(?:MHz)?`)
memoryCapacityRegexp = regexp.MustCompile(`MemTotal:\s*([0-9]+) kB`)
swapCapacityRegexp = regexp.MustCompile(`SwapTotal:\s*([0-9]+) kB`)
cpuBusPath = "/sys/bus/cpu/devices/"
isMemoryController = regexp.MustCompile("mc[0-9]+")
isDimm = regexp.MustCompile("dimm[0-9]+")
machineArch = getMachineArch()
)
const maxFreqFile = "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"
const cpuBusPath = "/sys/bus/cpu/devices/"
const nodePath = "/sys/devices/system/node"
const sysFsCPUCoreID = "core_id"
const sysFsCPUPhysicalPackageID = "physical_package_id"
const sysFsCPUTopology = "topology"
const memTypeFileName = "dimm_mem_type"
const sizeFileName = "size"
// GetPhysicalCores returns number of CPU cores reading /proc/cpuinfo file or if needed information from sysfs cpu path
func GetPhysicalCores(procInfo []byte) int {
numCores := getUniqueMatchesCount(string(procInfo), coreRegExp)
if numCores == 0 {
// read number of cores from /sys/bus/cpu/devices/cpu*/topology/core_id to deal with processors
// for which 'core id' is not available in /proc/cpuinfo
numCores = getUniqueCPUPropertyCount(cpuBusPath, sysFsCPUCoreID)
}
if numCores == 0 {
klog.Errorf("Cannot read number of physical cores correctly, number of cores set to %d", numCores)
}
return numCores
}
// GetSockets returns number of CPU sockets reading /proc/cpuinfo file or if needed information from sysfs cpu path
func GetSockets(procInfo []byte) int {
numSocket := getUniqueMatchesCount(string(procInfo), nodeRegExp)
if numSocket == 0 {
// read number of sockets from /sys/bus/cpu/devices/cpu*/topology/physical_package_id to deal with processors
// for which 'physical id' is not available in /proc/cpuinfo
numSocket = getUniqueCPUPropertyCount(cpuBusPath, sysFsCPUPhysicalPackageID)
}
if numSocket == 0 {
klog.Errorf("Cannot read number of sockets correctly, number of sockets set to %d", numSocket)
}
return numSocket
}
// GetClockSpeed returns the CPU clock speed, given a []byte formatted as the /proc/cpuinfo file.
func GetClockSpeed(procInfo []byte) (uint64, error) {
// s390/s390x, aarch64 and arm32 changes
if isSystemZ() || isAArch64() || isArm32() {
// s390/s390x, mips64, riscv64, aarch64 and arm32 changes
if isMips64() || isSystemZ() || isAArch64() || isArm32() || isRiscv64() {
return 0, nil
}
@@ -101,6 +139,65 @@ func GetMachineMemoryCapacity() (uint64, error) {
return memoryCapacity, err
}
// GetMachineMemoryByType returns information about memory capacity and number of DIMMs.
// Information is retrieved from sysfs edac per-DIMM API (/sys/devices/system/edac/mc/)
// introduced in kernel 3.6. Documentation can be found at
// https://www.kernel.org/doc/Documentation/admin-guide/ras.rst.
// Full list of memory types can be found in edac_mc.c
// (https://github.com/torvalds/linux/blob/v5.5/drivers/edac/edac_mc.c#L198)
func GetMachineMemoryByType(edacPath string) (map[string]*info.MemoryInfo, error) {
memory := map[string]*info.MemoryInfo{}
names, err := ioutil.ReadDir(edacPath)
// On some architectures (such as ARM) memory controller device may not exist.
// If this is the case then we ignore error and return empty slice.
_, ok := err.(*os.PathError)
if err != nil && ok {
return memory, nil
} else if err != nil {
return memory, err
}
for _, controllerDir := range names {
controller := controllerDir.Name()
if !isMemoryController.MatchString(controller) {
continue
}
dimms, err := ioutil.ReadDir(path.Join(edacPath, controllerDir.Name()))
if err != nil {
return map[string]*info.MemoryInfo{}, err
}
for _, dimmDir := range dimms {
dimm := dimmDir.Name()
if !isDimm.MatchString(dimm) {
continue
}
memType, err := ioutil.ReadFile(path.Join(edacPath, controller, dimm, memTypeFileName))
readableMemType := strings.TrimSpace(string(memType))
if err != nil {
return map[string]*info.MemoryInfo{}, err
}
if _, exists := memory[readableMemType]; !exists {
memory[readableMemType] = &info.MemoryInfo{}
}
size, err := ioutil.ReadFile(path.Join(edacPath, controller, dimm, sizeFileName))
if err != nil {
return map[string]*info.MemoryInfo{}, err
}
capacity, err := strconv.Atoi(strings.TrimSpace(string(size)))
if err != nil {
return map[string]*info.MemoryInfo{}, err
}
memory[readableMemType].Capacity += uint64(mbToBytes(capacity))
memory[readableMemType].DimmCount++
}
}
return memory, nil
}
func mbToBytes(megabytes int) int {
return megabytes * 1024 * 1024
}
// GetMachineSwapCapacity returns the machine's total swap from /proc/meminfo.
// Returns the total swap capacity as an uint64 (number of bytes).
func GetMachineSwapCapacity() (uint64, error) {
@@ -116,6 +213,15 @@ func GetMachineSwapCapacity() (uint64, error) {
return swapCapacity, err
}
// GetTopology returns CPU topology reading information from sysfs
func GetTopology(sysFs sysfs.SysFs) ([]info.Node, int, error) {
// s390/s390x changes
if isSystemZ() {
return nil, getNumCores(), nil
}
return sysinfo.GetNodesInfo(sysFs)
}
// parseCapacity matches a Regexp in a []byte, returning the resulting value in bytes.
// Assumes that the value matched by the Regexp is in KB.
func parseCapacity(b []byte, r *regexp.Regexp) (uint64, error) {
@@ -132,224 +238,26 @@ func parseCapacity(b []byte, r *regexp.Regexp) (uint64, error) {
return m * 1024, err
}
/* Look for sysfs cpu path containing core_id */
/* Such as: sys/bus/cpu/devices/cpu0/topology/core_id */
func getCoreIdFromCpuBus(cpuBusPath string, threadId int) (int, error) {
path := filepath.Join(cpuBusPath, fmt.Sprintf("cpu%d/topology", threadId))
file := filepath.Join(path, "core_id")
num, err := ioutil.ReadFile(file)
// Looks for sysfs cpu path containing given CPU property, e.g. core_id or physical_package_id
// and returns number of unique values of given property, exemplary usage: getting number of CPU physical cores
func getUniqueCPUPropertyCount(cpuBusPath string, propertyName string) int {
pathPattern := cpuBusPath + "cpu*[0-9]"
sysCPUPaths, err := filepath.Glob(pathPattern)
if err != nil {
return threadId, err
klog.Errorf("Cannot find files matching pattern (pathPattern: %s), number of unique %s set to 0", pathPattern, propertyName)
return 0
}
coreId, err := strconv.ParseInt(string(bytes.TrimSpace(num)), 10, 32)
if err != nil {
return threadId, err
}
if coreId < 0 {
// report threadId if found coreId < 0
coreId = int64(threadId)
}
return int(coreId), nil
}
/* Look for sysfs cpu path containing node id */
/* Such as: /sys/bus/cpu/devices/cpu0/node%d */
func getNodeIdFromCpuBus(cpuBusPath string, threadId int) (int, error) {
path := filepath.Join(cpuBusPath, fmt.Sprintf("cpu%d", threadId))
files, err := ioutil.ReadDir(path)
if err != nil {
return 0, err
}
nodeId := 0
for _, file := range files {
filename := file.Name()
isNode, error := regexp.MatchString("^node([0-9]+)$", filename)
if error != nil {
continue
}
if !isNode {
continue
}
ok, val, _ := extractValue(filename, nodeBusRegExp)
uniques := make(map[string]bool)
for _, sysCPUPath := range sysCPUPaths {
propertyPath := filepath.Join(sysCPUPath, sysFsCPUTopology, propertyName)
propertyVal, err := ioutil.ReadFile(propertyPath)
if err != nil {
continue
}
if ok {
if val < 0 {
continue
}
nodeId = val
klog.Errorf("Cannot open %s, number of unique %s set to 0", propertyPath, propertyName)
return 0
}
uniques[string(propertyVal)] = true
}
return nodeId, nil
}
// GetHugePagesInfo returns information about pre-allocated huge pages
// hugepagesDirectory should be top directory of hugepages
// Such as: /sys/kernel/mm/hugepages/
func GetHugePagesInfo(hugepagesDirectory string) ([]info.HugePagesInfo, error) {
var hugePagesInfo []info.HugePagesInfo
files, err := ioutil.ReadDir(hugepagesDirectory)
if err != nil {
// treat as non-fatal since kernels and machine can be
// configured to disable hugepage support
return hugePagesInfo, nil
}
for _, st := range files {
nameArray := strings.Split(st.Name(), "-")
pageSizeArray := strings.Split(nameArray[1], "kB")
pageSize, err := strconv.ParseUint(string(pageSizeArray[0]), 10, 64)
if err != nil {
return hugePagesInfo, err
}
numFile := hugepagesDirectory + st.Name() + "/nr_hugepages"
val, err := ioutil.ReadFile(numFile)
if err != nil {
return hugePagesInfo, err
}
var numPages uint64
// we use sscanf as the file as a new-line that trips up ParseUint
// it returns the number of tokens successfully parsed, so if
// n != 1, it means we were unable to parse a number from the file
n, err := fmt.Sscanf(string(val), "%d", &numPages)
if err != nil || n != 1 {
return hugePagesInfo, fmt.Errorf("could not parse file %v contents %q", numFile, string(val))
}
hugePagesInfo = append(hugePagesInfo, info.HugePagesInfo{
NumPages: numPages,
PageSize: pageSize,
})
}
return hugePagesInfo, nil
}
func GetTopology(sysFs sysfs.SysFs, cpuinfo string) ([]info.Node, int, error) {
nodes := []info.Node{}
// s390/s390x changes
if true == isSystemZ() {
return nodes, getNumCores(), nil
}
numCores := 0
lastThread := -1
lastCore := -1
lastNode := -1
for _, line := range strings.Split(cpuinfo, "\n") {
if line == "" {
continue
}
ok, val, err := extractValue(line, cpuRegExp)
if err != nil {
return nil, -1, fmt.Errorf("could not parse cpu info from %q: %v", line, err)
}
if ok {
thread := val
numCores++
if lastThread != -1 {
// New cpu section. Save last one.
nodeIdx, err := addNode(&nodes, lastNode)
if err != nil {
return nil, -1, fmt.Errorf("failed to add node %d: %v", lastNode, err)
}
nodes[nodeIdx].AddThread(lastThread, lastCore)
lastCore = -1
lastNode = -1
}
lastThread = thread
/* On Arm platform, no 'core id' and 'physical id' in '/proc/cpuinfo'. */
/* So we search sysfs cpu path directly. */
/* This method can also be used on other platforms, such as x86, ppc64le... */
/* /sys/bus/cpu/devices/cpu%d contains the information of 'core_id' & 'node_id'. */
/* Such as: /sys/bus/cpu/devices/cpu0/topology/core_id */
/* Such as: /sys/bus/cpu/devices/cpu0/node0 */
if isAArch64() {
val, err = getCoreIdFromCpuBus(cpuBusPath, lastThread)
if err != nil {
// Report thread id if no NUMA
val = lastThread
}
lastCore = val
val, err = getNodeIdFromCpuBus(cpuBusPath, lastThread)
if err != nil {
// Report node 0 if no NUMA
val = 0
}
lastNode = val
}
continue
}
if isAArch64() {
/* On Arm platform, no 'core id' and 'physical id' in '/proc/cpuinfo'. */
continue
}
ok, val, err = extractValue(line, coreRegExp)
if err != nil {
return nil, -1, fmt.Errorf("could not parse core info from %q: %v", line, err)
}
if ok {
lastCore = val
continue
}
ok, val, err = extractValue(line, nodeRegExp)
if err != nil {
return nil, -1, fmt.Errorf("could not parse node info from %q: %v", line, err)
}
if ok {
lastNode = val
continue
}
}
nodeIdx, err := addNode(&nodes, lastNode)
if err != nil {
return nil, -1, fmt.Errorf("failed to add node %d: %v", lastNode, err)
}
nodes[nodeIdx].AddThread(lastThread, lastCore)
if numCores < 1 {
return nil, numCores, fmt.Errorf("could not detect any cores")
}
for idx, node := range nodes {
caches, err := sysinfo.GetCacheInfo(sysFs, node.Cores[0].Threads[0])
if err != nil {
klog.Errorf("failed to get cache information for node %d: %v", node.Id, err)
continue
}
numThreadsPerCore := len(node.Cores[0].Threads)
numThreadsPerNode := len(node.Cores) * numThreadsPerCore
for _, cache := range caches {
c := info.Cache{
Size: cache.Size,
Level: cache.Level,
Type: cache.Type,
}
if cache.Cpus == numThreadsPerNode && cache.Level > 2 {
// Add a node-level cache.
nodes[idx].AddNodeCache(c)
} else if cache.Cpus == numThreadsPerCore {
// Add to each core.
nodes[idx].AddPerCoreCache(c)
}
// Ignore unknown caches.
}
}
return nodes, numCores, nil
return len(uniques)
}
func extractValue(s string, r *regexp.Regexp) (bool, int, error) {
@@ -364,88 +272,49 @@ func extractValue(s string, r *regexp.Regexp) (bool, int, error) {
return false, -1, nil
}
func findNode(nodes []info.Node, id int) (bool, int) {
for i, n := range nodes {
if n.Id == id {
return true, i
}
// getUniqueMatchesCount returns number of unique matches in given argument using provided regular expression
func getUniqueMatchesCount(s string, r *regexp.Regexp) int {
matches := r.FindAllString(s, -1)
uniques := make(map[string]bool)
for _, match := range matches {
uniques[match] = true
}
return false, -1
return len(uniques)
}
func addNode(nodes *[]info.Node, id int) (int, error) {
var idx int
if id == -1 {
// Some VMs don't fill topology data. Export single package.
id = 0
}
ok, idx := findNode(*nodes, id)
if !ok {
// New node
node := info.Node{Id: id}
// Add per-node memory information.
meminfo := fmt.Sprintf("/sys/devices/system/node/node%d/meminfo", id)
out, err := ioutil.ReadFile(meminfo)
// Ignore if per-node info is not available.
if err == nil {
m, err := parseCapacity(out, memoryCapacityRegexp)
if err != nil {
return -1, err
}
node.Memory = uint64(m)
}
// Look for per-node hugepages info using node id
// Such as: /sys/devices/system/node/node%d/hugepages
hugepagesDirectory := fmt.Sprintf("%s/node%d/hugepages/", nodePath, id)
hugePagesInfo, err := GetHugePagesInfo(hugepagesDirectory)
if err != nil {
return -1, err
}
node.HugePages = hugePagesInfo
*nodes = append(*nodes, node)
idx = len(*nodes) - 1
}
return idx, nil
}
// s390/s390x changes
func getMachineArch() (string, error) {
func getMachineArch() string {
uname := unix.Utsname{}
err := unix.Uname(&uname)
if err != nil {
return "", err
klog.Errorf("Cannot get machine architecture, err: %v", err)
return ""
}
return string(uname.Machine[:]), nil
return string(uname.Machine[:])
}
// arm32 chanes
// arm32 changes
func isArm32() bool {
arch, err := getMachineArch()
if err == nil {
return strings.Contains(arch, "arm")
}
return false
return strings.Contains(machineArch, "arm")
}
// aarch64 changes
func isAArch64() bool {
arch, err := getMachineArch()
if err == nil {
return strings.Contains(arch, "aarch64")
}
return false
return strings.Contains(machineArch, "aarch64")
}
// s390/s390x changes
func isSystemZ() bool {
arch, err := getMachineArch()
if err == nil {
return strings.Contains(arch, "390")
}
return false
return strings.Contains(machineArch, "390")
}
// riscv64 changes
func isRiscv64() bool {
return strings.Contains(machineArch, "riscv64")
}
// mips64 changes
func isMips64() bool {
return strings.Contains(machineArch, "mips64")
}
// s390/s390x changes

View File

@@ -0,0 +1,98 @@
// +build libipmctl,cgo
// Copyright 2020 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package machine
// #cgo pkg-config: libipmctl
// #include <nvm_management.h>
import "C"
import (
"fmt"
info "github.com/google/cadvisor/info/v1"
"k8s.io/klog"
)
// getNVMAvgPowerBudget retrieves configured power budget
// (in watts) for NVM devices. When libipmct is not available
// zero is returned.
func getNVMAvgPowerBudget() (uint, error) {
// Get number of devices on the platform
// see: https://github.com/intel/ipmctl/blob/v01.00.00.3497/src/os/nvm_api/nvm_management.h#L1478
var count C.uint
err := C.nvm_get_number_of_devices(&count)
if err != C.NVM_SUCCESS {
klog.Warningf("Unable to get number of NVM devices. Status code: %d", err)
return uint(0), fmt.Errorf("Unable to get number of NVM devices. Status code: %d", err)
}
// Load basic device information for all the devices
// to obtain UID of the first one.
var devices = make([]C.struct_device_discovery, count)
err = C.nvm_get_devices(&devices[0], C.uchar(count))
if err != C.NVM_SUCCESS {
klog.Warningf("Unable to get all NVM devices. Status code: %d", err)
return uint(0), fmt.Errorf("Unable to get all NVM devices. Status code: %d", err)
}
// Power budget is same for all the devices
// so we can rely on any of them.
var device C.struct_device_details
err = C.nvm_get_device_details(&devices[0].uid[0], &device)
if err != C.NVM_SUCCESS {
uid := C.GoString(&devices[0].uid[0])
klog.Warningf("Unable to get details of NVM device %q. Status code: %d", uid, err)
return uint(0), fmt.Errorf("Unable to get details of NVM device %q. Status code: %d", uid, err)
}
return uint(device.avg_power_budget / 1000), nil
}
// getNVMCapacities retrieves the total NVM capacity in bytes for memory mode and app direct mode
func getNVMCapacities() (uint64, uint64, error) {
var caps C.struct_device_capacities
err := C.nvm_get_nvm_capacities(&caps)
if err != C.NVM_SUCCESS {
klog.Warningf("Unable to get NVM capacity. Status code: %d", err)
return uint64(0), uint64(0), fmt.Errorf("Unable to get NVM capacity. Status code: %d", err)
}
return uint64(caps.memory_capacity), uint64(caps.app_direct_capacity), nil
}
// GetNVMInfo returns information specific for non-volatile memory modules
func GetNVMInfo() (info.NVMInfo, error) {
nvmInfo := info.NVMInfo{}
// Initialize libipmctl library.
cErr := C.nvm_init()
if cErr != C.NVM_SUCCESS {
klog.Warningf("libipmctl initialization failed with status %d", cErr)
return info.NVMInfo{}, fmt.Errorf("libipmctl initialization failed with status %d", cErr)
}
defer C.nvm_uninit()
var err error
nvmInfo.MemoryModeCapacity, nvmInfo.AppDirectModeCapacity, err = getNVMCapacities()
if err != nil {
return info.NVMInfo{}, fmt.Errorf("Unable to get NVM capacities, err: %s", err)
}
nvmInfo.AvgPowerBudget, err = getNVMAvgPowerBudget()
if err != nil {
return info.NVMInfo{}, fmt.Errorf("Unable to get NVM average power budget, err: %s", err)
}
return nvmInfo, nil
}

View File

@@ -0,0 +1,25 @@
// +build !libipmctl !cgo
// Copyright 2020 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package machine
import info "github.com/google/cadvisor/info/v1"
// GetNVMInfo returns information specific for non-volatile memory modules.
// When libipmct is not available zero value is returned.
func GetNVMInfo() (info.NVMInfo, error) {
return info.NVMInfo{}, nil
}

View File

@@ -0,0 +1,55 @@
// Copyright 2020 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build freebsd darwin linux
package machine
import (
"fmt"
"io/ioutil"
"os"
"os/exec"
"regexp"
"runtime"
"strings"
)
var rex = regexp.MustCompile("(PRETTY_NAME)=(.*)")
// getOperatingSystem gets the name of the current operating system.
func getOperatingSystem() (string, error) {
if runtime.GOOS == "darwin" || runtime.GOOS == "freebsd" {
cmd := exec.Command("uname", "-s")
osName, err := cmd.Output()
if err != nil {
return "", err
}
return string(osName), nil
} else {
bytes, err := ioutil.ReadFile("/etc/os-release")
if err != nil && os.IsNotExist(err) {
// /usr/lib/os-release in stateless systems like Clear Linux
bytes, err = ioutil.ReadFile("/usr/lib/os-release")
}
if err != nil {
return "", fmt.Errorf("error opening file : %v", err)
}
line := rex.FindAllStringSubmatch(string(bytes), -1)
if len(line) > 0 {
return strings.Trim(line[0][2], "\""), nil
}
return "Linux", nil
}
}

View File

@@ -0,0 +1,54 @@
// Copyright 2020 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package machine
import (
"fmt"
"golang.org/x/sys/windows/registry"
)
func getOperatingSystem() (string, error) {
system := "Windows"
k, err := registry.OpenKey(registry.LOCAL_MACHINE, `SOFTWARE\Microsoft\Windows NT\CurrentVersion`, registry.QUERY_VALUE)
if err != nil {
return system, err
}
defer k.Close()
productName, _, err := k.GetStringValue("ProductName")
if err != nil {
return system, nil
}
releaseId, _, err := k.GetStringValue("ReleaseId")
if err != nil {
return system, err
}
currentBuildNumber, _, err := k.GetStringValue("CurrentBuildNumber")
if err != nil {
return system, err
}
revision, _, err := k.GetIntegerValue("UBR")
if err != nil {
return system, err
}
system = fmt.Sprintf("%s Version %s (OS Build %s.%d)",
productName, releaseId, currentBuildNumber, revision)
return system, nil
}

View File

@@ -22,6 +22,7 @@ go_library(
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/google/cadvisor/info/v2:go_default_library",
"//vendor/github.com/google/cadvisor/machine:go_default_library",
"//vendor/github.com/google/cadvisor/stats:go_default_library",
"//vendor/github.com/google/cadvisor/summary:go_default_library",
"//vendor/github.com/google/cadvisor/utils/cpuload:go_default_library",
"//vendor/github.com/google/cadvisor/utils/oomparser:go_default_library",

View File

@@ -29,12 +29,12 @@ import (
"sync"
"time"
"github.com/google/cadvisor/accelerators"
"github.com/google/cadvisor/cache/memory"
"github.com/google/cadvisor/collector"
"github.com/google/cadvisor/container"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/stats"
"github.com/google/cadvisor/summary"
"github.com/google/cadvisor/utils/cpuload"
@@ -81,7 +81,7 @@ type containerData struct {
logUsage bool
// Tells the container to stop.
stop chan bool
stop chan struct{}
// Tells the container to immediately collect stats
onDemandChan chan chan struct{}
@@ -90,7 +90,7 @@ type containerData struct {
collectorManager collector.CollectorManager
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
nvidiaCollector accelerators.AcceleratorCollector
nvidiaCollector stats.Collector
}
// jitter returns a time.Duration between duration and duration + maxFactor * duration,
@@ -114,7 +114,7 @@ func (c *containerData) Stop() error {
if err != nil {
return err
}
c.stop <- true
close(c.stop)
return nil
}
@@ -383,7 +383,7 @@ func newContainerData(containerName string, memoryCache *memory.InMemoryCache, h
allowDynamicHousekeeping: allowDynamicHousekeeping,
logUsage: logUsage,
loadAvg: -1.0, // negative value indicates uninitialized.
stop: make(chan bool, 1),
stop: make(chan struct{}),
collectorManager: collectorManager,
onDemandChan: make(chan chan struct{}, 100),
clock: clock,

View File

@@ -37,6 +37,7 @@ import (
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/machine"
"github.com/google/cadvisor/stats"
"github.com/google/cadvisor/utils/oomparser"
"github.com/google/cadvisor/utils/sysfs"
"github.com/google/cadvisor/version"
@@ -181,7 +182,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, maxHousekeepingIn
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHttpClient: collectorHttpClient,
nvidiaManager: &accelerators.NvidiaManager{},
nvidiaManager: accelerators.NewNvidiaManager(),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
}
@@ -230,7 +231,7 @@ type manager struct {
containerWatchers []watcher.ContainerWatcher
eventsChannel chan watcher.ContainerEvent
collectorHttpClient *http.Client
nvidiaManager accelerators.AcceleratorManager
nvidiaManager stats.Manager
// List of raw container cgroup path prefix whitelist.
rawContainerCgroupPathPrefixWhiteList []string
}

View File

@@ -321,6 +321,60 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
},
}...)
}
if includedMetrics.Has(container.HugetlbUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_hugetlb_failcnt",
help: "Number of hugepage usage hits limits",
valueType: prometheus.CounterValue,
extraLabels: []string{"pagesize"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Hugetlb))
for k, v := range s.Hugetlb {
values = append(values, metricValue{
value: float64(v.Failcnt),
labels: []string{k},
timestamp: s.Timestamp,
})
}
return values
},
}, {
name: "container_hugetlb_usage_bytes",
help: "Current hugepage usage in bytes",
valueType: prometheus.GaugeValue,
extraLabels: []string{"pagesize"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Hugetlb))
for k, v := range s.Hugetlb {
values = append(values, metricValue{
value: float64(v.Usage),
labels: []string{k},
timestamp: s.Timestamp,
})
}
return values
},
},
{
name: "container_hugetlb_max_usage_bytes",
help: "Maximum hugepage usage recorded in bytes",
valueType: prometheus.GaugeValue,
extraLabels: []string{"pagesize"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Hugetlb))
for k, v := range s.Hugetlb {
values = append(values, metricValue{
value: float64(v.MaxUsage),
labels: []string{k},
timestamp: s.Timestamp,
})
}
return values
},
},
}...)
}
if includedMetrics.Has(container.MemoryUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
@@ -961,6 +1015,417 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
},
}...)
}
if includedMetrics.Has(container.NetworkAdvancedTcpUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_network_advance_tcp_stats_total",
help: "advance tcp connections statistic for container",
valueType: prometheus.GaugeValue,
extraLabels: []string{"tcp_state"},
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{
{
value: float64(s.Network.TcpAdvanced.RtoAlgorithm),
labels: []string{"rtoalgorithm"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.RtoMin),
labels: []string{"rtomin"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.RtoMax),
labels: []string{"rtomax"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.MaxConn),
labels: []string{"maxconn"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.ActiveOpens),
labels: []string{"activeopens"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.PassiveOpens),
labels: []string{"passiveopens"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.AttemptFails),
labels: []string{"attemptfails"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.EstabResets),
labels: []string{"estabresets"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.CurrEstab),
labels: []string{"currestab"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.InSegs),
labels: []string{"insegs"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.OutSegs),
labels: []string{"outsegs"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.RetransSegs),
labels: []string{"retranssegs"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.InErrs),
labels: []string{"inerrs"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.OutRsts),
labels: []string{"outrsts"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.InCsumErrors),
labels: []string{"incsumerrors"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.EmbryonicRsts),
labels: []string{"embryonicrsts"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.SyncookiesSent),
labels: []string{"syncookiessent"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.SyncookiesRecv),
labels: []string{"syncookiesrecv"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.SyncookiesFailed),
labels: []string{"syncookiesfailed"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.PruneCalled),
labels: []string{"prunecalled"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.RcvPruned),
labels: []string{"rcvpruned"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.OfoPruned),
labels: []string{"ofopruned"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.OutOfWindowIcmps),
labels: []string{"outofwindowicmps"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.LockDroppedIcmps),
labels: []string{"lockdroppedicmps"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TW),
labels: []string{"tw"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TWRecycled),
labels: []string{"twrecycled"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TWKilled),
labels: []string{"twkilled"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPTimeWaitOverflow),
labels: []string{"tcptimewaitoverflow"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPTimeouts),
labels: []string{"tcptimeouts"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSpuriousRTOs),
labels: []string{"tcpspuriousrtos"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPLossProbes),
labels: []string{"tcplossprobes"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPLossProbeRecovery),
labels: []string{"tcplossproberecovery"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPRenoRecoveryFail),
labels: []string{"tcprenorecoveryfail"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSackRecoveryFail),
labels: []string{"tcpsackrecoveryfail"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPRenoFailures),
labels: []string{"tcprenofailures"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSackFailures),
labels: []string{"tcpsackfailures"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPLossFailures),
labels: []string{"tcplossfailures"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.DelayedACKs),
labels: []string{"delayedacks"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.DelayedACKLocked),
labels: []string{"delayedacklocked"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.DelayedACKLost),
labels: []string{"delayedacklost"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.ListenOverflows),
labels: []string{"listenoverflows"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.ListenDrops),
labels: []string{"listendrops"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPHPHits),
labels: []string{"tcphphits"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPPureAcks),
labels: []string{"tcppureacks"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPHPAcks),
labels: []string{"tcphpacks"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPRenoRecovery),
labels: []string{"tcprenorecovery"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSackRecovery),
labels: []string{"tcpsackrecovery"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSACKReneging),
labels: []string{"tcpsackreneging"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFACKReorder),
labels: []string{"tcpfackreorder"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSACKReorder),
labels: []string{"tcpsackreorder"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPRenoReorder),
labels: []string{"tcprenoreorder"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPTSReorder),
labels: []string{"tcptsreorder"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFullUndo),
labels: []string{"tcpfullundo"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPPartialUndo),
labels: []string{"tcppartialundo"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPDSACKUndo),
labels: []string{"tcpdsackundo"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPLossUndo),
labels: []string{"tcplossundo"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFastRetrans),
labels: []string{"tcpfastretrans"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSlowStartRetrans),
labels: []string{"tcpslowstartretrans"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPLostRetransmit),
labels: []string{"tcplostretransmit"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPRetransFail),
labels: []string{"tcpretransfail"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPRcvCollapsed),
labels: []string{"tcprcvcollapsed"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPDSACKOldSent),
labels: []string{"tcpdsackoldsent"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPDSACKOfoSent),
labels: []string{"tcpdsackofosent"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPDSACKRecv),
labels: []string{"tcpdsackrecv"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPDSACKOfoRecv),
labels: []string{"tcpdsackoforecv"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPAbortOnData),
labels: []string{"tcpabortondata"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPAbortOnClose),
labels: []string{"tcpabortonclose"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPAbortOnMemory),
labels: []string{"tcpabortonmemory"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPAbortOnTimeout),
labels: []string{"tcpabortontimeout"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPAbortOnLinger),
labels: []string{"tcpabortonlinger"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPAbortFailed),
labels: []string{"tcpabortfailed"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPMemoryPressures),
labels: []string{"tcpmemorypressures"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPMemoryPressuresChrono),
labels: []string{"tcpmemorypressureschrono"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSACKDiscard),
labels: []string{"tcpsackdiscard"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPDSACKIgnoredOld),
labels: []string{"tcpdsackignoredold"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPDSACKIgnoredNoUndo),
labels: []string{"tcpdsackignorednoundo"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPMD5NotFound),
labels: []string{"tcpmd5notfound"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPMD5Unexpected),
labels: []string{"tcpmd5unexpected"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPMD5Failure),
labels: []string{"tcpmd5failure"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSackShifted),
labels: []string{"tcpsackshifted"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSackMerged),
labels: []string{"tcpsackmerged"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSackShiftFallback),
labels: []string{"tcpsackshiftfallback"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPBacklogDrop),
labels: []string{"tcpbacklogdrop"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.PFMemallocDrop),
labels: []string{"pfmemallocdrop"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPMinTTLDrop),
labels: []string{"tcpminttldrop"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPDeferAcceptDrop),
labels: []string{"tcpdeferacceptdrop"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.IPReversePathFilter),
labels: []string{"ipreversepathfilter"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPReqQFullDoCookies),
labels: []string{"tcpreqqfulldocookies"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPReqQFullDrop),
labels: []string{"tcpreqqfulldrop"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFastOpenActive),
labels: []string{"tcpfastopenactive"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFastOpenActiveFail),
labels: []string{"tcpfastopenactivefail"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFastOpenPassive),
labels: []string{"tcpfastopenpassive"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFastOpenPassiveFail),
labels: []string{"tcpfastopenpassivefail"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFastOpenListenOverflow),
labels: []string{"tcpfastopenlistenoverflow"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPFastOpenCookieReqd),
labels: []string{"tcpfastopencookiereqd"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPSynRetrans),
labels: []string{"tcpsynretrans"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.TCPOrigDataSent),
labels: []string{"tcporigdatasent"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.PAWSActive),
labels: []string{"pawsactive"},
timestamp: s.Timestamp,
}, {
value: float64(s.Network.TcpAdvanced.PAWSEstab),
labels: []string{"pawsestab"},
timestamp: s.Timestamp,
},
}
},
},
}...)
}
if includedMetrics.Has(container.NetworkUdpUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
@@ -1079,6 +1544,23 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
}
},
},
{
name: "container_ulimits_soft",
help: "Soft ulimit values for the container root process. Unlimited if -1, except priority and nice",
valueType: prometheus.GaugeValue,
extraLabels: []string{"ulimit"},
getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0, len(s.Processes.Ulimits))
for _, ulimit := range s.Processes.Ulimits {
values = append(values, metricValue{
value: float64(ulimit.SoftLimit),
labels: []string{ulimit.Name},
timestamp: s.Timestamp,
})
}
return values
},
},
}...)
}
@@ -1246,7 +1728,24 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric)
)
}
}
if c.includedMetrics.Has(container.AppMetrics) {
for metricLabel, v := range stats.CustomMetrics {
for _, metric := range v {
clabels := make([]string, len(rawLabels), len(rawLabels)+len(metric.Labels))
cvalues := make([]string, len(rawLabels), len(rawLabels)+len(metric.Labels))
copy(clabels, labels)
copy(cvalues, values)
for label, value := range metric.Labels {
clabels = append(clabels, sanitizeLabelName("app_"+label))
cvalues = append(cvalues, value)
}
desc := prometheus.NewDesc(metricLabel, "Custom application metric.", clabels, nil)
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(metric.FloatValue), cvalues...)
}
}
}
}
}
func (c *PrometheusCollector) collectVersionInfo(ch chan<- prometheus.Metric) {

24
vendor/github.com/google/cadvisor/stats/BUILD generated vendored Normal file
View File

@@ -0,0 +1,24 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["types.go"],
importmap = "k8s.io/kubernetes/vendor/github.com/google/cadvisor/stats",
importpath = "github.com/google/cadvisor/stats",
visibility = ["//visibility:public"],
deps = ["//vendor/github.com/google/cadvisor/info/v1:go_default_library"],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@@ -1,4 +1,4 @@
// Copyright 2017 Google Inc. All Rights Reserved.
// Copyright 2020 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -11,22 +11,25 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package accelerators
// Handling statistics that are fully controlled in cAdvisor
package stats
import info "github.com/google/cadvisor/info/v1"
// This is supposed to store global state about an accelerator metrics collector.
// cadvisor manager will call Setup() when it starts and Destroy() when it stops.
// For each container detected by the cadvisor manager, it will call
// This is supposed to store global state about an cAdvisor metrics collector.
// cAdvisor manager will call Setup() when it starts and Destroy() when it stops.
// For each container detected by the cAdvisor manager, it will call
// GetCollector() with the devices cgroup path for that container.
// GetCollector() is supposed to return an object that can update
// accelerator stats for that container.
type AcceleratorManager interface {
type Manager interface {
Setup()
Destroy()
GetCollector(deviceCgroup string) (AcceleratorCollector, error)
GetCollector(deviceCgroup string) (Collector, error)
}
type AcceleratorCollector interface {
// Collector can update ContainerStats by adding more metrics.
type Collector interface {
UpdateStats(*info.ContainerStats) error
}

View File

@@ -25,7 +25,6 @@ filegroup(
":package-srcs",
"//vendor/github.com/google/cadvisor/utils/cloudinfo:all-srcs",
"//vendor/github.com/google/cadvisor/utils/cpuload:all-srcs",
"//vendor/github.com/google/cadvisor/utils/docker:all-srcs",
"//vendor/github.com/google/cadvisor/utils/oomparser:all-srcs",
"//vendor/github.com/google/cadvisor/utils/sysfs:all-srcs",
"//vendor/github.com/google/cadvisor/utils/sysinfo:all-srcs",

View File

@@ -26,7 +26,9 @@ import (
)
var (
containerRegexp = regexp.MustCompile(`Task in (.*) killed as a result of limit of (.*)`)
legacyContainerRegexp = regexp.MustCompile(`Task in (.*) killed as a result of limit of (.*)`)
// Starting in 5.0 linux kernels, the OOM message changed
containerRegexp = regexp.MustCompile(`oom-kill:constraint=(.*),nodemask=(.*),cpuset=(.*),mems_allowed=(.*),oom_memcg=(.*) (.*),task_memcg=(.*),task=(.*),pid=(.*),uid=(.*)`)
lastLineRegexp = regexp.MustCompile(`Killed process ([0-9]+) \((.+)\)`)
firstLineRegexp = regexp.MustCompile(`invoked oom-killer:`)
)
@@ -51,11 +53,14 @@ type OomInstance struct {
// the absolute name of the container that was killed
// due to the OOM.
VictimContainerName string
// the constraint that triggered the OOM. One of CONSTRAINT_NONE,
// CONSTRAINT_CPUSET, CONSTRAINT_MEMORY_POLICY, CONSTRAINT_MEMCG
Constraint string
}
// gets the container name from a line and adds it to the oomInstance.
func getContainerName(line string, currentOomInstance *OomInstance) error {
parsedLine := containerRegexp.FindStringSubmatch(line)
func getLegacyContainerName(line string, currentOomInstance *OomInstance) error {
parsedLine := legacyContainerRegexp.FindStringSubmatch(line)
if parsedLine == nil {
return nil
}
@@ -64,6 +69,25 @@ func getContainerName(line string, currentOomInstance *OomInstance) error {
return nil
}
// gets the container name from a line and adds it to the oomInstance.
func getContainerName(line string, currentOomInstance *OomInstance) (bool, error) {
parsedLine := containerRegexp.FindStringSubmatch(line)
if parsedLine == nil {
// Fall back to the legacy format if it isn't found here.
return false, getLegacyContainerName(line, currentOomInstance)
}
currentOomInstance.ContainerName = parsedLine[7]
currentOomInstance.VictimContainerName = parsedLine[5]
currentOomInstance.Constraint = parsedLine[1]
pid, err := strconv.Atoi(parsedLine[9])
if err != nil {
return false, err
}
currentOomInstance.Pid = pid
currentOomInstance.ProcessName = parsedLine[8]
return true, nil
}
// gets the pid, name, and date from a line and adds it to oomInstance
func getProcessNamePid(line string, currentOomInstance *OomInstance) (bool, error) {
reList := lastLineRegexp.FindStringSubmatch(line)
@@ -106,13 +130,15 @@ func (self *OomParser) StreamOoms(outStream chan<- *OomInstance) {
TimeOfDeath: msg.Timestamp,
}
for msg := range kmsgEntries {
err := getContainerName(msg.Message, oomCurrentInstance)
finished, err := getContainerName(msg.Message, oomCurrentInstance)
if err != nil {
klog.Errorf("%v", err)
}
finished, err := getProcessNamePid(msg.Message, oomCurrentInstance)
if err != nil {
klog.Errorf("%v", err)
if !finished {
finished, err = getProcessNamePid(msg.Message, oomCurrentInstance)
if err != nil {
klog.Errorf("%v", err)
}
}
if finished {
oomCurrentInstance.TimeOfDeath = msg.Timestamp

View File

@@ -19,6 +19,7 @@ import (
"io/ioutil"
"os"
"path"
"path/filepath"
"strconv"
"strings"
)
@@ -30,6 +31,20 @@ const (
dmiDir = "/sys/class/dmi"
ppcDevTree = "/proc/device-tree"
s390xDevTree = "/etc" // s390/s390x changes
hugePagesDirName = "hugepages"
coreIDFilePath = "/topology/core_id"
meminfoFile = "meminfo"
cpuDirPattern = "cpu*[0-9]"
nodeDirPattern = "node*[0-9]"
//HugePagesNrFile name of nr_hugepages file in sysfs
HugePagesNrFile = "nr_hugepages"
)
var (
nodeDir = "/sys/devices/system/node/"
)
type CacheInfo struct {
@@ -45,6 +60,18 @@ type CacheInfo struct {
// Abstracts the lowest level calls to sysfs.
type SysFs interface {
// Get NUMA nodes paths
GetNodesPaths() ([]string, error)
// Get paths to CPU assigned for specified NUMA node
GetCPUsPaths(nodePath string) ([]string, error)
// Get physical core id for specified CPU
GetCoreID(coreIDFilePath string) (string, error)
// Get total memory for specified NUMA node
GetMemInfo(nodeDir string) (string, error)
// Get hugepages from specified directory
GetHugePagesInfo(hugePagesDirectory string) ([]os.FileInfo, error)
// Get hugepage_nr from specified directory
GetHugePagesNr(hugePagesDirectory string, hugePageName string) (string, error)
// Get directory information for available block devices.
GetBlockDevices() ([]os.FileInfo, error)
// Get Size of a given block device.
@@ -74,6 +101,47 @@ func NewRealSysFs() SysFs {
return &realSysFs{}
}
func (self *realSysFs) GetNodesPaths() ([]string, error) {
pathPattern := fmt.Sprintf("%s%s", nodeDir, nodeDirPattern)
return filepath.Glob(pathPattern)
}
func (self *realSysFs) GetCPUsPaths(nodePath string) ([]string, error) {
pathPattern := fmt.Sprintf("%s/%s", nodePath, cpuDirPattern)
return filepath.Glob(pathPattern)
}
func (self *realSysFs) GetCoreID(cpuPath string) (string, error) {
coreIDFilePath := fmt.Sprintf("%s%s", cpuPath, coreIDFilePath)
coreID, err := ioutil.ReadFile(coreIDFilePath)
if err != nil {
return "", err
}
return strings.TrimSpace(string(coreID)), err
}
func (self *realSysFs) GetMemInfo(nodePath string) (string, error) {
meminfoPath := fmt.Sprintf("%s/%s", nodePath, meminfoFile)
meminfo, err := ioutil.ReadFile(meminfoPath)
if err != nil {
return "", err
}
return strings.TrimSpace(string(meminfo)), err
}
func (self *realSysFs) GetHugePagesInfo(hugePagesDirectory string) ([]os.FileInfo, error) {
return ioutil.ReadDir(hugePagesDirectory)
}
func (self *realSysFs) GetHugePagesNr(hugepagesDirectory string, hugePageName string) (string, error) {
hugePageFilePath := fmt.Sprintf("%s%s/%s", hugepagesDirectory, hugePageName, HugePagesNrFile)
hugePageFile, err := ioutil.ReadFile(hugePageFilePath)
if err != nil {
return "", err
}
return strings.TrimSpace(string(hugePageFile)), err
}
func (self *realSysFs) GetBlockDevices() ([]os.FileInfo, error) {
return ioutil.ReadDir(blockDir)
}

View File

@@ -9,6 +9,8 @@ go_library(
deps = [
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/google/cadvisor/utils/sysfs:go_default_library",
"//vendor/github.com/google/go-cmp/cmp:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
)

View File

@@ -22,9 +22,22 @@ import (
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/utils/sysfs"
"github.com/google/go-cmp/cmp"
"k8s.io/klog"
)
var schedulerRegExp = regexp.MustCompile(`.*\[(.*)\].*`)
var (
schedulerRegExp = regexp.MustCompile(`.*\[(.*)\].*`)
nodeDirRegExp = regexp.MustCompile("node/node(\\d*)")
cpuDirRegExp = regexp.MustCompile("/cpu(\\d*)")
memoryCapacityRegexp = regexp.MustCompile(`MemTotal:\s*([0-9]+) kB`)
)
const (
cacheLevel2 = 2
hugepagesDir = "hugepages/"
)
// Get information about block devices present on the system.
// Uses the passed in system interface to retrieve the low level OS information.
@@ -133,6 +146,199 @@ func GetNetworkDevices(sysfs sysfs.SysFs) ([]info.NetInfo, error) {
return netDevices, nil
}
// GetHugePagesInfo returns information about pre-allocated huge pages
// hugepagesDirectory should be top directory of hugepages
// Such as: /sys/kernel/mm/hugepages/
func GetHugePagesInfo(sysFs sysfs.SysFs, hugepagesDirectory string) ([]info.HugePagesInfo, error) {
var hugePagesInfo []info.HugePagesInfo
files, err := sysFs.GetHugePagesInfo(hugepagesDirectory)
if err != nil {
// treat as non-fatal since kernels and machine can be
// configured to disable hugepage support
return hugePagesInfo, nil
}
for _, st := range files {
nameArray := strings.Split(st.Name(), "-")
pageSizeArray := strings.Split(nameArray[1], "kB")
pageSize, err := strconv.ParseUint(string(pageSizeArray[0]), 10, 64)
if err != nil {
return hugePagesInfo, err
}
val, err := sysFs.GetHugePagesNr(hugepagesDirectory, st.Name())
if err != nil {
return hugePagesInfo, err
}
var numPages uint64
// we use sscanf as the file as a new-line that trips up ParseUint
// it returns the number of tokens successfully parsed, so if
// n != 1, it means we were unable to parse a number from the file
n, err := fmt.Sscanf(string(val), "%d", &numPages)
if err != nil || n != 1 {
return hugePagesInfo, fmt.Errorf("could not parse file nr_hugepage for %s, contents %q", st.Name(), string(val))
}
hugePagesInfo = append(hugePagesInfo, info.HugePagesInfo{
NumPages: numPages,
PageSize: pageSize,
})
}
return hugePagesInfo, nil
}
// GetNodesInfo returns information about NUMA nodes and their topology
func GetNodesInfo(sysFs sysfs.SysFs) ([]info.Node, int, error) {
nodes := []info.Node{}
allLogicalCoresCount := 0
nodesDirs, err := sysFs.GetNodesPaths()
if err != nil || len(nodesDirs) == 0 {
if len(nodesDirs) == 0 && err == nil {
//sysFs.GetNodesPaths uses filePath.Glob which does not return any error if pattern does not match anything
err = fmt.Errorf("Any path to specific node is not found")
}
return nil, 0, err
}
for _, nodeDir := range nodesDirs {
id, err := getMatchedInt(nodeDirRegExp, nodeDir)
node := info.Node{Id: id}
cores, logicalCoreCount, err := getCoresInfo(sysFs, nodeDir)
if err != nil {
return nil, 0, err
}
node.Cores = cores
allLogicalCoresCount += logicalCoreCount
err = addCacheInfo(sysFs, &node)
if err != nil {
return nil, 0, err
}
node.Memory, err = getNodeMemInfo(sysFs, nodeDir)
if err != nil {
return nil, 0, err
}
hugepagesDirectory := fmt.Sprintf("%s/%s", nodeDir, hugepagesDir)
node.HugePages, err = GetHugePagesInfo(sysFs, hugepagesDirectory)
if err != nil {
return nil, 0, err
}
nodes = append(nodes, node)
}
return nodes, allLogicalCoresCount, err
}
// addCacheInfo adds information about cache for NUMA node
func addCacheInfo(sysFs sysfs.SysFs, node *info.Node) error {
for coreID, core := range node.Cores {
threadID := core.Threads[0] //get any thread for core
caches, err := GetCacheInfo(sysFs, threadID)
if err != nil {
return err
}
numThreadsPerCore := len(core.Threads)
numThreadsPerNode := len(node.Cores) * numThreadsPerCore
for _, cache := range caches {
c := info.Cache{
Size: cache.Size,
Level: cache.Level,
Type: cache.Type,
}
if cache.Cpus == numThreadsPerNode && cache.Level > cacheLevel2 {
// Add a node-level cache.
cacheFound := false
for _, nodeCache := range node.Caches {
if cmp.Equal(nodeCache, c) {
cacheFound = true
}
}
if !cacheFound {
node.Caches = append(node.Caches, c)
}
} else if cache.Cpus == numThreadsPerCore {
// Add core level cache
node.Cores[coreID].Caches = append(node.Cores[coreID].Caches, c)
}
// Ignore unknown caches.
}
}
return nil
}
// getNodeMemInfo returns information about total memory for NUMA node
func getNodeMemInfo(sysFs sysfs.SysFs, nodeDir string) (uint64, error) {
rawMem, err := sysFs.GetMemInfo(nodeDir)
if err != nil {
//Ignore if per-node info is not available.
klog.Warningf("Found node without memory information, nodeDir: %s", nodeDir)
return 0, nil
}
matches := memoryCapacityRegexp.FindStringSubmatch(rawMem)
if len(matches) != 2 {
return 0, fmt.Errorf("failed to match regexp in output: %q", string(rawMem))
}
memory, err := strconv.ParseUint(matches[1], 10, 64)
if err != nil {
return 0, err
}
memory = memory * 1024 // Convert to bytes
return uint64(memory), nil
}
// getCoresInfo retruns infromation about physical and logical cores assigned to NUMA node
func getCoresInfo(sysFs sysfs.SysFs, nodeDir string) ([]info.Core, int, error) {
cpuDirs, err := sysFs.GetCPUsPaths(nodeDir)
if err != nil || len(cpuDirs) == 0 {
klog.Warningf("Found node without any CPU, nodeDir: %s, number of cpuDirs %d, err: %v", nodeDir, len(cpuDirs), err)
return nil, 0, nil
}
cores := make([]info.Core, 0, len(cpuDirs))
for _, cpuDir := range cpuDirs {
cpuID, err := getMatchedInt(cpuDirRegExp, cpuDir)
if err != nil {
return nil, 0, fmt.Errorf("Unexpected format of CPU directory, cpuDirRegExp %s, cpuDir: %s", cpuDirRegExp, cpuDir)
}
rawPhysicalID, err := sysFs.GetCoreID(cpuDir)
if err != nil {
return nil, 0, err
}
physicalID, err := strconv.Atoi(rawPhysicalID)
if err != nil {
return nil, 0, err
}
coreIDx := -1
for id, core := range cores {
if core.Id == physicalID {
coreIDx = id
}
}
if coreIDx == -1 {
cores = append(cores, info.Core{})
coreIDx = len(cores) - 1
}
desiredCore := &cores[coreIDx]
desiredCore.Id = physicalID
if len(desiredCore.Threads) == 0 {
desiredCore.Threads = []int{cpuID}
} else {
desiredCore.Threads = append(desiredCore.Threads, cpuID)
}
}
return cores, len(cpuDirs), nil
}
// GetCacheInfo return information about a cache accessible from the given cpu thread
func GetCacheInfo(sysFs sysfs.SysFs, id int) ([]sysfs.CacheInfo, error) {
caches, err := sysFs.GetCaches(id)
if err != nil {
@@ -201,3 +407,15 @@ func getNetworkStats(name string, sysFs sysfs.SysFs) (info.InterfaceStats, error
func GetSystemUUID(sysFs sysfs.SysFs) (string, error) {
return sysFs.GetSystemUUID()
}
func getMatchedInt(rgx *regexp.Regexp, str string) (int, error) {
matches := rgx.FindStringSubmatch(str)
if len(matches) != 2 {
return 0, fmt.Errorf("failed to match regexp, str: %s", str)
}
valInt, err := strconv.Atoi(matches[1])
if err != nil {
return 0, err
}
return valInt, nil
}