diff --git a/pkg/kubelet/kuberuntime/BUILD b/pkg/kubelet/kuberuntime/BUILD index d0a52dd54d0..8b87f733254 100644 --- a/pkg/kubelet/kuberuntime/BUILD +++ b/pkg/kubelet/kuberuntime/BUILD @@ -16,6 +16,7 @@ go_library( "doc.go", "fake_kuberuntime_manager.go", "helpers.go", + "instrumented_services.go", "kuberuntime_container.go", "kuberuntime_gc.go", "kuberuntime_image.go", @@ -38,6 +39,7 @@ go_library( "//pkg/kubelet/events:go_default_library", "//pkg/kubelet/images:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/metrics:go_default_library", "//pkg/kubelet/network:go_default_library", "//pkg/kubelet/prober/results:go_default_library", "//pkg/kubelet/qos:go_default_library", diff --git a/pkg/kubelet/kuberuntime/instrumented_services.go b/pkg/kubelet/kuberuntime/instrumented_services.go new file mode 100644 index 00000000000..4aaed743d6a --- /dev/null +++ b/pkg/kubelet/kuberuntime/instrumented_services.go @@ -0,0 +1,223 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kuberuntime + +import ( + "io" + "time" + + internalApi "k8s.io/kubernetes/pkg/kubelet/api" + runtimeApi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/runtime" + "k8s.io/kubernetes/pkg/kubelet/metrics" +) + +// instrumentedRuntimeService wraps the RuntimeService and records the operations +// and errors metrics. +type instrumentedRuntimeService struct { + service internalApi.RuntimeService +} + +// Creates an instrumented RuntimeInterface from an existing RuntimeService. +func NewInstrumentedRuntimeService(service internalApi.RuntimeService) internalApi.RuntimeService { + return &instrumentedRuntimeService{service: service} +} + +// instrumentedImageManagerService wraps the ImageManagerService and records the operations +// and errors metrics. +type instrumentedImageManagerService struct { + service internalApi.ImageManagerService +} + +// Creates an instrumented ImageManagerService from an existing ImageManagerService. +func NewInstrumentedImageManagerService(service internalApi.ImageManagerService) internalApi.ImageManagerService { + return &instrumentedImageManagerService{service: service} +} + +// recordOperation records the duration of the operation. +func recordOperation(operation string, start time.Time) { + metrics.RuntimeOperations.WithLabelValues(operation).Inc() + metrics.RuntimeOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInMicroseconds(start)) +} + +// recordError records error for metric if an error occurred. +func recordError(operation string, err error) { + if err != nil { + metrics.RuntimeOperationsErrors.WithLabelValues(operation).Inc() + } +} + +func (in instrumentedRuntimeService) Version(apiVersion string) (*runtimeApi.VersionResponse, error) { + const operation = "version" + defer recordOperation(operation, time.Now()) + + out, err := in.service.Version(apiVersion) + recordError(operation, err) + return out, err +} + +func (in instrumentedRuntimeService) CreateContainer(podSandboxID string, config *runtimeApi.ContainerConfig, sandboxConfig *runtimeApi.PodSandboxConfig) (string, error) { + const operation = "create_container" + defer recordOperation(operation, time.Now()) + + out, err := in.service.CreateContainer(podSandboxID, config, sandboxConfig) + recordError(operation, err) + return out, err +} + +func (in instrumentedRuntimeService) StartContainer(containerID string) error { + const operation = "start_container" + defer recordOperation(operation, time.Now()) + + err := in.service.StartContainer(containerID) + recordError(operation, err) + return err +} + +func (in instrumentedRuntimeService) StopContainer(containerID string, timeout int64) error { + const operation = "stop_container" + defer recordOperation(operation, time.Now()) + + err := in.service.StopContainer(containerID, timeout) + recordError(operation, err) + return err +} + +func (in instrumentedRuntimeService) RemoveContainer(containerID string) error { + const operation = "remove_container" + defer recordOperation(operation, time.Now()) + + err := in.service.RemoveContainer(containerID) + recordError(operation, err) + return err +} + +func (in instrumentedRuntimeService) ListContainers(filter *runtimeApi.ContainerFilter) ([]*runtimeApi.Container, error) { + const operation = "list_containers" + defer recordOperation(operation, time.Now()) + + out, err := in.service.ListContainers(filter) + recordError(operation, err) + return out, err +} + +func (in instrumentedRuntimeService) ContainerStatus(containerID string) (*runtimeApi.ContainerStatus, error) { + const operation = "container_status" + defer recordOperation(operation, time.Now()) + + out, err := in.service.ContainerStatus(containerID) + recordError(operation, err) + return out, err +} + +func (in instrumentedRuntimeService) Exec(containerID string, cmd []string, tty bool, stdin io.Reader, stdout, stderr io.WriteCloser) error { + const operation = "exec" + defer recordOperation(operation, time.Now()) + + err := in.service.Exec(containerID, cmd, tty, stdin, stdout, stderr) + recordError(operation, err) + return err +} + +func (in instrumentedRuntimeService) RunPodSandbox(config *runtimeApi.PodSandboxConfig) (string, error) { + const operation = "run_podsandbox" + defer recordOperation(operation, time.Now()) + + out, err := in.service.RunPodSandbox(config) + recordError(operation, err) + return out, err +} + +func (in instrumentedRuntimeService) StopPodSandbox(podSandboxID string) error { + const operation = "stop_podsandbox" + defer recordOperation(operation, time.Now()) + + err := in.service.StopPodSandbox(podSandboxID) + recordError(operation, err) + return err +} + +func (in instrumentedRuntimeService) RemovePodSandbox(podSandboxID string) error { + const operation = "remove_podsandbox" + defer recordOperation(operation, time.Now()) + + err := in.service.RemovePodSandbox(podSandboxID) + recordError(operation, err) + return err +} + +func (in instrumentedRuntimeService) PodSandboxStatus(podSandboxID string) (*runtimeApi.PodSandboxStatus, error) { + const operation = "podsandbox_status" + defer recordOperation(operation, time.Now()) + + out, err := in.service.PodSandboxStatus(podSandboxID) + recordError(operation, err) + return out, err +} + +func (in instrumentedRuntimeService) ListPodSandbox(filter *runtimeApi.PodSandboxFilter) ([]*runtimeApi.PodSandbox, error) { + const operation = "list_podsandbox" + defer recordOperation(operation, time.Now()) + + out, err := in.service.ListPodSandbox(filter) + recordError(operation, err) + return out, err +} + +func (in instrumentedRuntimeService) UpdateRuntimeConfig(runtimeConfig *runtimeApi.RuntimeConfig) error { + const operation = "update_runtime_config" + defer recordOperation(operation, time.Now()) + + err := in.service.UpdateRuntimeConfig(runtimeConfig) + recordError(operation, err) + return err +} + +func (in instrumentedImageManagerService) ListImages(filter *runtimeApi.ImageFilter) ([]*runtimeApi.Image, error) { + const operation = "list_images" + defer recordOperation(operation, time.Now()) + + out, err := in.service.ListImages(filter) + recordError(operation, err) + return out, err +} + +func (in instrumentedImageManagerService) ImageStatus(image *runtimeApi.ImageSpec) (*runtimeApi.Image, error) { + const operation = "image_status" + defer recordOperation(operation, time.Now()) + + out, err := in.service.ImageStatus(image) + recordError(operation, err) + return out, err +} + +func (in instrumentedImageManagerService) PullImage(image *runtimeApi.ImageSpec, auth *runtimeApi.AuthConfig) error { + const operation = "pull_image" + defer recordOperation(operation, time.Now()) + + err := in.service.PullImage(image, auth) + recordError(operation, err) + return err +} + +func (in instrumentedImageManagerService) RemoveImage(image *runtimeApi.ImageSpec) error { + const operation = "remove_image" + defer recordOperation(operation, time.Now()) + + err := in.service.RemoveImage(image) + recordError(operation, err) + return err +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index d4bd574db94..15993026115 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -138,8 +138,8 @@ func NewKubeGenericRuntimeManager( osInterface: osInterface, networkPlugin: networkPlugin, runtimeHelper: runtimeHelper, - runtimeService: runtimeService, - imageService: imageService, + runtimeService: NewInstrumentedRuntimeService(runtimeService), + imageService: NewInstrumentedImageManagerService(imageService), keyring: credentialprovider.NewDockerKeyring(), } diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 7461a736577..433f57715f3 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -39,6 +39,10 @@ const ( PodWorkerStartLatencyKey = "pod_worker_start_latency_microseconds" PLEGRelistLatencyKey = "pleg_relist_latency_microseconds" PLEGRelistIntervalKey = "pleg_relist_interval_microseconds" + // Metrics keys of remote runtime operations + RuntimeOperationsKey = "runtime_operations" + RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds" + RuntimeOperationsErrorsKey = "runtime_operations_errors" ) var ( @@ -93,6 +97,7 @@ var ( Help: "Latency in microseconds from seeing a pod to starting a worker.", }, ) + // TODO(random-liu): Move the following docker metrics into shim once dockertools is deprecated. DockerOperationsLatency = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, @@ -139,6 +144,31 @@ var ( Help: "Interval in microseconds between relisting in PLEG.", }, ) + // Metrics of remote runtime operations. + RuntimeOperations = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: RuntimeOperationsKey, + Help: "Cumulative number of runtime operations by operation type.", + }, + []string{"operation_type"}, + ) + RuntimeOperationsLatency = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: RuntimeOperationsLatencyKey, + Help: "Latency in microseconds of runtime operations. Broken down by operation type.", + }, + []string{"operation_type"}, + ) + RuntimeOperationsErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: RuntimeOperationsErrorsKey, + Help: "Cumulative number of runtime operation errors by operation type.", + }, + []string{"operation_type"}, + ) ) var registerMetrics sync.Once @@ -161,6 +191,9 @@ func Register(containerCache kubecontainer.RuntimeCache) { prometheus.MustRegister(newPodAndContainerCollector(containerCache)) prometheus.MustRegister(PLEGRelistLatency) prometheus.MustRegister(PLEGRelistInterval) + prometheus.MustRegister(RuntimeOperations) + prometheus.MustRegister(RuntimeOperationsLatency) + prometheus.MustRegister(RuntimeOperationsErrors) }) }