[cri] add sandbox and container latency metrics

These are simple timers that give users a more fine-grained view of the
latency of internal sandbox and container operations.

Signed-off-by: Michael Crosby <michael@thepasture.io>
This commit is contained in:
Michael Crosby 2021-10-07 19:35:43 +00:00
parent 432ddecaae
commit 91bbaf6799
10 changed files with 111 additions and 0 deletions

View File

@ -102,6 +102,7 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
return nil, errors.Wrapf(err, "failed to get image from containerd %q", image.ID) return nil, errors.Wrapf(err, "failed to get image from containerd %q", image.ID)
} }
start := time.Now()
// Run container using the same runtime with sandbox. // Run container using the same runtime with sandbox.
sandboxInfo, err := sandbox.Container.Info(ctx) sandboxInfo, err := sandbox.Container.Info(ctx)
if err != nil { if err != nil {
@ -278,6 +279,8 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
return nil, errors.Wrapf(err, "failed to add container %q into store", id) return nil, errors.Wrapf(err, "failed to add container %q into store", id)
} }
containerCreateTimer.WithValues(ociRuntime.Type).UpdateSince(start)
return &runtime.CreateContainerResponse{ContainerId: id}, nil return &runtime.CreateContainerResponse{ContainerId: id}, nil
} }

View File

@ -17,6 +17,8 @@
package server package server
import ( import (
"time"
"golang.org/x/net/context" "golang.org/x/net/context"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1" runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
@ -26,6 +28,7 @@ import (
// ListContainers lists all containers matching the filter. // ListContainers lists all containers matching the filter.
func (c *criService) ListContainers(ctx context.Context, r *runtime.ListContainersRequest) (*runtime.ListContainersResponse, error) { func (c *criService) ListContainers(ctx context.Context, r *runtime.ListContainersRequest) (*runtime.ListContainersResponse, error) {
start := time.Now()
// List all containers from store. // List all containers from store.
containersInStore := c.containerStore.List() containersInStore := c.containerStore.List()
@ -35,6 +38,8 @@ func (c *criService) ListContainers(ctx context.Context, r *runtime.ListContaine
} }
containers = c.filterCRIContainers(containers, r.GetFilter()) containers = c.filterCRIContainers(containers, r.GetFilter())
containerListTimer.UpdateSince(start)
return &runtime.ListContainersResponse{Containers: containers}, nil return &runtime.ListContainersResponse{Containers: containers}, nil
} }

View File

@ -17,6 +17,8 @@
package server package server
import ( import (
"time"
"github.com/containerd/containerd" "github.com/containerd/containerd"
"github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/log" "github.com/containerd/containerd/log"
@ -30,6 +32,7 @@ import (
// RemoveContainer removes the container. // RemoveContainer removes the container.
func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveContainerRequest) (_ *runtime.RemoveContainerResponse, retErr error) { func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveContainerRequest) (_ *runtime.RemoveContainerResponse, retErr error) {
start := time.Now()
container, err := c.containerStore.Get(r.GetContainerId()) container, err := c.containerStore.Get(r.GetContainerId())
if err != nil { if err != nil {
if !errdefs.IsNotFound(err) { if !errdefs.IsNotFound(err) {
@ -40,6 +43,10 @@ func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveConta
return &runtime.RemoveContainerResponse{}, nil return &runtime.RemoveContainerResponse{}, nil
} }
id := container.ID id := container.ID
i, err := container.Container.Info(ctx)
if err != nil {
return nil, errors.Wrap(err, "get container info")
}
// Forcibly stop the containers if they are in running or unknown state // Forcibly stop the containers if they are in running or unknown state
state := container.Status.Get().State() state := container.Status.Get().State()
@ -99,6 +106,8 @@ func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveConta
c.containerNameIndex.ReleaseByKey(id) c.containerNameIndex.ReleaseByKey(id)
containerRemoveTimer.WithValues(i.Runtime.Name).UpdateSince(start)
return &runtime.RemoveContainerResponse{}, nil return &runtime.RemoveContainerResponse{}, nil
} }

View File

@ -40,11 +40,17 @@ import (
// StartContainer starts the container. // StartContainer starts the container.
func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) { func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) {
start := time.Now()
cntr, err := c.containerStore.Get(r.GetContainerId()) cntr, err := c.containerStore.Get(r.GetContainerId())
if err != nil { if err != nil {
return nil, errors.Wrapf(err, "an error occurred when try to find container %q", r.GetContainerId()) return nil, errors.Wrapf(err, "an error occurred when try to find container %q", r.GetContainerId())
} }
info, err := cntr.Container.Info(ctx)
if err != nil {
return nil, errors.Wrap(err, "get container info")
}
id := cntr.ID id := cntr.ID
meta := cntr.Metadata meta := cntr.Metadata
container := cntr.Container container := cntr.Container
@ -162,6 +168,8 @@ func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContain
// It handles the TaskExit event and update container state after this. // It handles the TaskExit event and update container state after this.
c.eventMonitor.startContainerExitMonitor(context.Background(), id, task.Pid(), exitCh) c.eventMonitor.startContainerExitMonitor(context.Background(), id, task.Pid(), exitCh)
containerStartTimer.WithValues(info.Runtime.Name).UpdateSince(start)
return &runtime.StartContainerResponse{}, nil return &runtime.StartContainerResponse{}, nil
} }

View File

@ -35,6 +35,7 @@ import (
// StopContainer stops a running container with a grace period (i.e., timeout). // StopContainer stops a running container with a grace period (i.e., timeout).
func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) { func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) {
start := time.Now()
// Get container config from container store. // Get container config from container store.
container, err := c.containerStore.Get(r.GetContainerId()) container, err := c.containerStore.Get(r.GetContainerId())
if err != nil { if err != nil {
@ -45,6 +46,13 @@ func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainer
return nil, err return nil, err
} }
i, err := container.Container.Info(ctx)
if err != nil {
return nil, errors.Wrap(err, "get container info")
}
containerStopTimer.WithValues(i.Runtime.Name).UpdateSince(start)
return &runtime.StopContainerResponse{}, nil return &runtime.StopContainerResponse{}, nil
} }

58
pkg/cri/server/metrics.go Normal file
View File

@ -0,0 +1,58 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package server
import (
metrics "github.com/docker/go-metrics"
)
// Package-level latency timers for CRI sandbox and container operations.
// Every variable below is assigned in init and belongs to the namespace that
// init registers with the go-metrics package.
//
// metrics.Timer values carry no labels; metrics.LabeledTimer values are
// created in init with a single "runtime" label.
var (
// Sandbox timers.
sandboxListTimer metrics.Timer
sandboxCreateNetworkTimer metrics.Timer
// NOTE(review): this name lacks the "Timer" suffix used by its siblings;
// renaming it requires updating every user of the variable as well.
sandboxDeleteNetwork metrics.Timer
sandboxRuntimeCreateTimer metrics.LabeledTimer
sandboxRuntimeStopTimer metrics.LabeledTimer
sandboxRemoveTimer metrics.LabeledTimer
// Container timers.
containerListTimer metrics.Timer
containerRemoveTimer metrics.LabeledTimer
containerCreateTimer metrics.LabeledTimer
containerStopTimer metrics.LabeledTimer
containerStartTimer metrics.LabeledTimer
)
// init creates the "containerd"/"cri" metrics namespace, instantiates every
// sandbox and container latency timer declared in this file, and registers the
// namespace with the go-metrics package.
func init() {
	// Label name shared by every labeled timer created below.
	const runtimeLabel = "runtime"

	// These CRI metrics record latencies for successful operations around a
	// sandbox and container's lifecycle.
	criNamespace := metrics.NewNamespace("containerd", "cri", nil)

	// Sandbox timers.
	sandboxListTimer = criNamespace.NewTimer("sandbox_list", "time to list sandboxes")
	sandboxCreateNetworkTimer = criNamespace.NewTimer("sandbox_create_network", "time to create the network for a sandbox")
	sandboxDeleteNetwork = criNamespace.NewTimer("sandbox_delete_network", "time to delete a sandbox's network")
	sandboxRuntimeCreateTimer = criNamespace.NewLabeledTimer("sandbox_runtime_create", "time to create a sandbox in the runtime", runtimeLabel)
	sandboxRuntimeStopTimer = criNamespace.NewLabeledTimer("sandbox_runtime_stop", "time to stop a sandbox", runtimeLabel)
	sandboxRemoveTimer = criNamespace.NewLabeledTimer("sandbox_remove", "time to remove a sandbox", runtimeLabel)

	// Container timers.
	containerListTimer = criNamespace.NewTimer("container_list", "time to list containers")
	containerRemoveTimer = criNamespace.NewLabeledTimer("container_remove", "time to remove a container", runtimeLabel)
	containerCreateTimer = criNamespace.NewLabeledTimer("container_create", "time to create a container", runtimeLabel)
	containerStopTimer = criNamespace.NewLabeledTimer("container_stop", "time to stop a container", runtimeLabel)
	containerStartTimer = criNamespace.NewLabeledTimer("container_start", "time to start a container", runtimeLabel)

	// Registration happens last, after every timer has been added to the namespace.
	metrics.Register(criNamespace)
}

View File

@ -17,6 +17,8 @@
package server package server
import ( import (
"time"
"golang.org/x/net/context" "golang.org/x/net/context"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1" runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
@ -25,6 +27,7 @@ import (
// ListPodSandbox returns a list of Sandbox. // ListPodSandbox returns a list of Sandbox.
func (c *criService) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandboxRequest) (*runtime.ListPodSandboxResponse, error) { func (c *criService) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandboxRequest) (*runtime.ListPodSandboxResponse, error) {
start := time.Now()
// List all sandboxes from store. // List all sandboxes from store.
sandboxesInStore := c.sandboxStore.List() sandboxesInStore := c.sandboxStore.List()
var sandboxes []*runtime.PodSandbox var sandboxes []*runtime.PodSandbox
@ -36,6 +39,8 @@ func (c *criService) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandb
} }
sandboxes = c.filterCRISandboxes(sandboxes, r.GetFilter()) sandboxes = c.filterCRISandboxes(sandboxes, r.GetFilter())
sandboxListTimer.UpdateSince(start)
return &runtime.ListPodSandboxResponse{Items: sandboxes}, nil return &runtime.ListPodSandboxResponse{Items: sandboxes}, nil
} }

View File

@ -17,6 +17,8 @@
package server package server
import ( import (
"time"
"github.com/containerd/containerd" "github.com/containerd/containerd"
"github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/log" "github.com/containerd/containerd/log"
@ -30,6 +32,7 @@ import (
// RemovePodSandbox removes the sandbox. If there are running containers in the // RemovePodSandbox removes the sandbox. If there are running containers in the
// sandbox, they should be forcibly removed. // sandbox, they should be forcibly removed.
func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodSandboxRequest) (*runtime.RemovePodSandboxResponse, error) { func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodSandboxRequest) (*runtime.RemovePodSandboxResponse, error) {
start := time.Now()
sandbox, err := c.sandboxStore.Get(r.GetPodSandboxId()) sandbox, err := c.sandboxStore.Get(r.GetPodSandboxId())
if err != nil { if err != nil {
if !errdefs.IsNotFound(err) { if !errdefs.IsNotFound(err) {
@ -108,5 +111,7 @@ func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodS
// Release the sandbox name reserved for the sandbox. // Release the sandbox name reserved for the sandbox.
c.sandboxNameIndex.ReleaseByKey(id) c.sandboxNameIndex.ReleaseByKey(id)
sandboxRemoveTimer.WithValues(sandbox.RuntimeHandler).UpdateSince(start)
return &runtime.RemovePodSandboxResponse{}, nil return &runtime.RemovePodSandboxResponse{}, nil
} }

View File

@ -22,6 +22,7 @@ import (
"path/filepath" "path/filepath"
goruntime "runtime" goruntime "runtime"
"strings" "strings"
"time"
"github.com/containerd/containerd" "github.com/containerd/containerd"
containerdio "github.com/containerd/containerd/cio" containerdio "github.com/containerd/containerd/cio"
@ -123,6 +124,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
} }
if podNetwork { if podNetwork {
netStart := time.Now()
// If it is not in host network namespace then create a namespace and set the sandbox // If it is not in host network namespace then create a namespace and set the sandbox
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
// namespaces. If the pod is in host network namespace then both are empty and should not // namespaces. If the pod is in host network namespace then both are empty and should not
@ -163,8 +165,10 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
if err := c.setupPodNetwork(ctx, &sandbox); err != nil { if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
return nil, errors.Wrapf(err, "failed to setup network for sandbox %q", id) return nil, errors.Wrapf(err, "failed to setup network for sandbox %q", id)
} }
sandboxCreateNetworkTimer.UpdateSince(netStart)
} }
runtimeStart := time.Now()
// Create sandbox container. // Create sandbox container.
// NOTE: sandboxContainerSpec SHOULD NOT have side // NOTE: sandboxContainerSpec SHOULD NOT have side
// effect, e.g. accessing/creating files, so that we can test // effect, e.g. accessing/creating files, so that we can test
@ -345,6 +349,8 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
// but we don't care about sandbox TaskOOM right now, so it is fine. // but we don't care about sandbox TaskOOM right now, so it is fine.
c.eventMonitor.startSandboxExitMonitor(context.Background(), id, task.Pid(), exitCh) c.eventMonitor.startSandboxExitMonitor(context.Background(), id, task.Pid(), exitCh)
sandboxRuntimeCreateTimer.WithValues(ociRuntime.Type).UpdateSince(runtimeStart)
return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil
} }

View File

@ -54,6 +54,7 @@ func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sa
// Stop all containers inside the sandbox. This terminates the container forcibly, // Stop all containers inside the sandbox. This terminates the container forcibly,
// and container may still be created, so production should not rely on this behavior. // and container may still be created, so production should not rely on this behavior.
// TODO(random-liu): Introduce a state in sandbox to avoid future container creation. // TODO(random-liu): Introduce a state in sandbox to avoid future container creation.
stop := time.Now()
containers := c.containerStore.List() containers := c.containerStore.List()
for _, container := range containers { for _, container := range containers {
if container.SandboxID != id { if container.SandboxID != id {
@ -77,9 +78,11 @@ func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sa
return errors.Wrapf(err, "failed to stop sandbox container %q in %q state", id, state) return errors.Wrapf(err, "failed to stop sandbox container %q in %q state", id, state)
} }
} }
sandboxRuntimeStopTimer.WithValues(sandbox.RuntimeHandler).UpdateSince(stop)
// Teardown network for sandbox. // Teardown network for sandbox.
if sandbox.NetNS != nil { if sandbox.NetNS != nil {
netStop := time.Now()
// Use empty netns path if netns is not available. This is defined in: // Use empty netns path if netns is not available. This is defined in:
// https://github.com/containernetworking/cni/blob/v0.7.0-alpha1/SPEC.md // https://github.com/containernetworking/cni/blob/v0.7.0-alpha1/SPEC.md
if closed, err := sandbox.NetNS.Closed(); err != nil { if closed, err := sandbox.NetNS.Closed(); err != nil {
@ -93,6 +96,7 @@ func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sa
if err := sandbox.NetNS.Remove(); err != nil { if err := sandbox.NetNS.Remove(); err != nil {
return errors.Wrapf(err, "failed to remove network namespace for sandbox %q", id) return errors.Wrapf(err, "failed to remove network namespace for sandbox %q", id)
} }
sandboxDeleteNetwork.UpdateSince(netStop)
} }
log.G(ctx).Infof("TearDown network for sandbox %q successfully", id) log.G(ctx).Infof("TearDown network for sandbox %q successfully", id)