diff --git a/pkg/cri/sbserver/metrics.go b/pkg/cri/sbserver/metrics.go
index 455927819..3bcff06a1 100644
--- a/pkg/cri/sbserver/metrics.go
+++ b/pkg/cri/sbserver/metrics.go
@@ -36,6 +36,10 @@ var (
 	containerStopTimer   metrics.LabeledTimer
 	containerStartTimer  metrics.LabeledTimer
 
+	networkPluginOperations        metrics.LabeledCounter
+	networkPluginOperationsErrors  metrics.LabeledCounter
+	networkPluginOperationsLatency metrics.LabeledTimer
+
 	imagePulls           metrics.LabeledCounter
 	inProgressImagePulls metrics.Gauge
 	// pull duration / (image size / 1MBi)
@@ -60,6 +64,10 @@ func init() {
 	containerStopTimer = ns.NewLabeledTimer("container_stop", "time to stop a container", "runtime")
 	containerStartTimer = ns.NewLabeledTimer("container_start", "time to start a container", "runtime")
 
+	networkPluginOperations = ns.NewLabeledCounter("network_plugin_operations_total", "cumulative number of network plugin operations by operation type", "operation_type")
+	networkPluginOperationsErrors = ns.NewLabeledCounter("network_plugin_operations_errors_total", "cumulative number of network plugin operation errors by operation type", "operation_type")
+	networkPluginOperationsLatency = ns.NewLabeledTimer("network_plugin_operations_duration_seconds", "latency in seconds of network plugin operations. Broken down by operation type", "operation_type")
+
 	imagePulls = ns.NewLabeledCounter("image_pulls", "succeeded and failed counters", "status")
 	inProgressImagePulls = ns.NewGauge("in_progress_image_pulls", "in progress pulls", metrics.Total)
 	imagePullThroughput = prom.NewHistogram(
@@ -72,3 +80,11 @@ func init() {
 
 	metrics.Register(ns)
 }
+
+// operation_type label values, kept for backwards compatibility with kubelet/dockershim metrics.
+// See https://github.com/containerd/containerd/issues/7801
+const (
+	networkStatusOp   = "get_pod_network_status"
+	networkSetUpOp    = "set_up_pod"
+	networkTearDownOp = "tear_down_pod"
+)
diff --git a/pkg/cri/sbserver/sandbox_run.go b/pkg/cri/sbserver/sandbox_run.go
index eed132130..d94afe8a6 100644
--- a/pkg/cri/sbserver/sandbox_run.go
+++ b/pkg/cri/sbserver/sandbox_run.go
@@ -340,8 +340,12 @@ func (c *criService) setupPodNetwork(ctx context.Context, sandbox *sandboxstore.
 		return fmt.Errorf("get cni namespace options: %w", err)
 	}
 	log.G(ctx).WithField("podsandboxid", id).Debugf("begin cni setup")
+	netStart := time.Now()
 	result, err := netPlugin.Setup(ctx, id, path, opts...)
+	networkPluginOperations.WithValues(networkSetUpOp).Inc()
+	networkPluginOperationsLatency.WithValues(networkSetUpOp).UpdateSince(netStart)
 	if err != nil {
+		networkPluginOperationsErrors.WithValues(networkSetUpOp).Inc()
 		return err
 	}
 	logDebugCNIResult(ctx, id, result)
diff --git a/pkg/cri/sbserver/sandbox_stop.go b/pkg/cri/sbserver/sandbox_stop.go
index 754b9a8f7..267316f3c 100644
--- a/pkg/cri/sbserver/sandbox_stop.go
+++ b/pkg/cri/sbserver/sandbox_stop.go
@@ -132,5 +132,13 @@ func (c *criService) teardownPodNetwork(ctx context.Context, sandbox sandboxstor
 		return fmt.Errorf("get cni namespace options: %w", err)
 	}
 
-	return netPlugin.Remove(ctx, id, path, opts...)
+	netStart := time.Now()
+	err = netPlugin.Remove(ctx, id, path, opts...)
+	networkPluginOperations.WithValues(networkTearDownOp).Inc()
+	networkPluginOperationsLatency.WithValues(networkTearDownOp).UpdateSince(netStart)
+	if err != nil {
+		networkPluginOperationsErrors.WithValues(networkTearDownOp).Inc()
+		return err
+	}
+	return nil
 }
diff --git a/pkg/cri/sbserver/update_runtime_config.go b/pkg/cri/sbserver/update_runtime_config.go
index db21b0785..8740739e5 100644
--- a/pkg/cri/sbserver/update_runtime_config.go
+++ b/pkg/cri/sbserver/update_runtime_config.go
@@ -24,6 +24,7 @@ import (
 	"path/filepath"
 	"strings"
 	"text/template"
+	"time"
 
 	"github.com/containerd/containerd/log"
 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
@@ -74,10 +75,16 @@ func (c *criService) UpdateRuntimeConfig(ctx context.Context, r *runtime.UpdateR
 		log.G(ctx).Infof("Network plugin is ready, skip generating cni config from template %q", confTemplate)
 		return &runtime.UpdateRuntimeConfigResponse{}, nil
 	}
-	if err := netPlugin.Status(); err == nil {
+	netStart := time.Now()
+	err = netPlugin.Status()
+	networkPluginOperations.WithValues(networkStatusOp).Inc()
+	networkPluginOperationsLatency.WithValues(networkStatusOp).UpdateSince(netStart)
+	if err == nil {
 		log.G(ctx).Infof("Network plugin is ready, skip generating cni config from template %q", confTemplate)
 		return &runtime.UpdateRuntimeConfigResponse{}, nil
-	} else if err := netPlugin.Load(c.cniLoadOptions()...); err == nil {
+	}
+	networkPluginOperationsErrors.WithValues(networkStatusOp).Inc()
+	if err := netPlugin.Load(c.cniLoadOptions()...); err == nil {
 		log.G(ctx).Infof("CNI config is successfully loaded, skip generating cni config from template %q", confTemplate)
 		return &runtime.UpdateRuntimeConfigResponse{}, nil
 	}
diff --git a/pkg/cri/server/metrics.go b/pkg/cri/server/metrics.go
index 49e2a7477..57a5ceba0 100644
--- a/pkg/cri/server/metrics.go
+++ b/pkg/cri/server/metrics.go
@@ -36,6 +36,10 @@ var (
 	containerStopTimer   metrics.LabeledTimer
 	containerStartTimer  metrics.LabeledTimer
 
+	networkPluginOperations        metrics.LabeledCounter
+	networkPluginOperationsErrors  metrics.LabeledCounter
+	networkPluginOperationsLatency metrics.LabeledTimer
+
 	imagePulls           metrics.LabeledCounter
 	inProgressImagePulls metrics.Gauge
 	// pull duration / (image size / 1MBi)
@@ -60,6 +64,10 @@ func init() {
 	containerStopTimer = ns.NewLabeledTimer("container_stop", "time to stop a container", "runtime")
 	containerStartTimer = ns.NewLabeledTimer("container_start", "time to start a container", "runtime")
 
+	networkPluginOperations = ns.NewLabeledCounter("network_plugin_operations_total", "cumulative number of network plugin operations by operation type", "operation_type")
+	networkPluginOperationsErrors = ns.NewLabeledCounter("network_plugin_operations_errors_total", "cumulative number of network plugin operation errors by operation type", "operation_type")
+	networkPluginOperationsLatency = ns.NewLabeledTimer("network_plugin_operations_duration_seconds", "latency in seconds of network plugin operations. Broken down by operation type", "operation_type")
+
 	imagePulls = ns.NewLabeledCounter("image_pulls", "succeeded and failed counters", "status")
 	inProgressImagePulls = ns.NewGauge("in_progress_image_pulls", "in progress pulls", metrics.Total)
 	imagePullThroughput = prom.NewHistogram(
@@ -72,3 +80,11 @@ func init() {
 
 	metrics.Register(ns)
 }
+
+// operation_type label values, kept for backwards compatibility with kubelet/dockershim metrics.
+// See https://github.com/containerd/containerd/issues/7801
+const (
+	networkStatusOp   = "get_pod_network_status"
+	networkSetUpOp    = "set_up_pod"
+	networkTearDownOp = "tear_down_pod"
+)
diff --git a/pkg/cri/server/sandbox_run.go b/pkg/cri/server/sandbox_run.go
index 4c1365f47..acc4a16ef 100644
--- a/pkg/cri/server/sandbox_run.go
+++ b/pkg/cri/server/sandbox_run.go
@@ -440,12 +440,16 @@ func (c *criService) setupPodNetwork(ctx context.Context, sandbox *sandboxstore.
 		return fmt.Errorf("get cni namespace options: %w", err)
 	}
 	log.G(ctx).WithField("podsandboxid", id).Debugf("begin cni setup")
+	netStart := time.Now()
 	if c.config.CniConfig.NetworkPluginSetupSerially {
 		result, err = netPlugin.SetupSerially(ctx, id, path, opts...)
 	} else {
 		result, err = netPlugin.Setup(ctx, id, path, opts...)
 	}
+	networkPluginOperations.WithValues(networkSetUpOp).Inc()
+	networkPluginOperationsLatency.WithValues(networkSetUpOp).UpdateSince(netStart)
 	if err != nil {
+		networkPluginOperationsErrors.WithValues(networkSetUpOp).Inc()
 		return err
 	}
 	logDebugCNIResult(ctx, id, result)
diff --git a/pkg/cri/server/sandbox_stop.go b/pkg/cri/server/sandbox_stop.go
index 84bf7a3c3..5aea5c32b 100644
--- a/pkg/cri/server/sandbox_stop.go
+++ b/pkg/cri/server/sandbox_stop.go
@@ -193,7 +193,15 @@ func (c *criService) teardownPodNetwork(ctx context.Context, sandbox sandboxstor
 		return fmt.Errorf("get cni namespace options: %w", err)
 	}
 
-	return netPlugin.Remove(ctx, id, path, opts...)
+	netStart := time.Now()
+	err = netPlugin.Remove(ctx, id, path, opts...)
+	networkPluginOperations.WithValues(networkTearDownOp).Inc()
+	networkPluginOperationsLatency.WithValues(networkTearDownOp).UpdateSince(netStart)
+	if err != nil {
+		networkPluginOperationsErrors.WithValues(networkTearDownOp).Inc()
+		return err
+	}
+	return nil
 }
 
 // cleanupUnknownSandbox cleanup stopped sandbox in unknown state.
diff --git a/pkg/cri/server/update_runtime_config.go b/pkg/cri/server/update_runtime_config.go
index 8f0a83ba7..52246746c 100644
--- a/pkg/cri/server/update_runtime_config.go
+++ b/pkg/cri/server/update_runtime_config.go
@@ -24,6 +24,7 @@ import (
 	"path/filepath"
 	"strings"
 	"text/template"
+	"time"
 
 	"github.com/containerd/containerd/log"
 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
@@ -74,10 +75,17 @@ func (c *criService) UpdateRuntimeConfig(ctx context.Context, r *runtime.UpdateR
 		log.G(ctx).Infof("Network plugin is ready, skip generating cni config from template %q", confTemplate)
 		return &runtime.UpdateRuntimeConfigResponse{}, nil
 	}
-	if err := netPlugin.Status(); err == nil {
+
+	netStart := time.Now()
+	err = netPlugin.Status()
+	networkPluginOperations.WithValues(networkStatusOp).Inc()
+	networkPluginOperationsLatency.WithValues(networkStatusOp).UpdateSince(netStart)
+	if err == nil {
 		log.G(ctx).Infof("Network plugin is ready, skip generating cni config from template %q", confTemplate)
 		return &runtime.UpdateRuntimeConfigResponse{}, nil
-	} else if err := netPlugin.Load(c.cniLoadOptions()...); err == nil {
+	}
+	networkPluginOperationsErrors.WithValues(networkStatusOp).Inc()
+	if err := netPlugin.Load(c.cniLoadOptions()...); err == nil {
 		log.G(ctx).Infof("CNI config is successfully loaded, skip generating cni config from template %q", confTemplate)
 		return &runtime.UpdateRuntimeConfigResponse{}, nil
 	}