Add spans to CRI runtime service and related client methods
This adds otel spans to CRI service mainly targeting mutating apis which includes: * Sandbox apis - RunPodSandbox, StopPodSandbox, RemovePodSandbox * Container apis - CreateContainer, StartContainer, StopContainer, RemoveContainer * Attach, Exec and Exec Sync * Containerd client methods: container.go, client.go, process.go and task.go Signed-off-by: Swagat Bora <sbora@amazon.com>
This commit is contained in:
@@ -22,6 +22,7 @@ import (
|
||||
"io"
|
||||
|
||||
containerd "github.com/containerd/containerd/v2/client"
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
"github.com/containerd/log"
|
||||
"k8s.io/client-go/tools/remotecommand"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
@@ -31,10 +32,12 @@ import (
|
||||
|
||||
// Attach prepares a streaming endpoint to attach to a running container, and returns the address.
|
||||
func (c *criService) Attach(ctx context.Context, r *runtime.AttachRequest) (*runtime.AttachResponse, error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
cntr, err := c.containerStore.Get(r.GetContainerId())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to find container in store: %w", err)
|
||||
}
|
||||
span.SetAttributes(tracing.Attribute("container.id", cntr.ID))
|
||||
state := cntr.Status.Get().State()
|
||||
if state != runtime.ContainerState_CONTAINER_RUNNING {
|
||||
return nil, fmt.Errorf("container is in %s state", criContainerStateToString(state))
|
||||
|
||||
@@ -45,6 +45,7 @@ import (
|
||||
"github.com/containerd/containerd/v2/internal/cri/util"
|
||||
"github.com/containerd/containerd/v2/pkg/blockio"
|
||||
"github.com/containerd/containerd/v2/pkg/oci"
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
"github.com/containerd/platforms"
|
||||
)
|
||||
|
||||
@@ -55,6 +56,7 @@ func init() {
|
||||
|
||||
// CreateContainer creates a new container in the given PodSandbox.
|
||||
func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateContainerRequest) (_ *runtime.CreateContainerResponse, retErr error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
config := r.GetConfig()
|
||||
log.G(ctx).Debugf("Container config %+v", config)
|
||||
sandboxConfig := r.GetSandboxConfig()
|
||||
@@ -72,7 +74,10 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
|
||||
sandboxID = cstatus.SandboxID
|
||||
sandboxPid = cstatus.Pid
|
||||
)
|
||||
|
||||
span.SetAttributes(
|
||||
tracing.Attribute("sandbox.id", sandboxID),
|
||||
tracing.Attribute("sandbox.pid", sandboxPid),
|
||||
)
|
||||
// Generate unique id and name for the container and reserve the name.
|
||||
// Reserve the container name to avoid concurrent `CreateContainer` request creating
|
||||
// the same container.
|
||||
@@ -87,6 +92,10 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
|
||||
if err = c.containerNameIndex.Reserve(name, id); err != nil {
|
||||
return nil, fmt.Errorf("failed to reserve container name %q: %w", name, err)
|
||||
}
|
||||
span.SetAttributes(
|
||||
tracing.Attribute("container.id", id),
|
||||
tracing.Attribute("container.name", name),
|
||||
)
|
||||
defer func() {
|
||||
// Release the name if the function returns with an error.
|
||||
if retErr != nil {
|
||||
@@ -112,7 +121,9 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get image from containerd %q: %w", image.ID, err)
|
||||
}
|
||||
|
||||
span.SetAttributes(
|
||||
tracing.Attribute("container.image.ref", containerdImage.Name()),
|
||||
)
|
||||
start := time.Now()
|
||||
|
||||
// Create container root directory.
|
||||
@@ -345,6 +356,9 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta
|
||||
|
||||
containerCreateTimer.WithValues(ociRuntime.Type).UpdateSince(start)
|
||||
|
||||
span.AddEvent("container created",
|
||||
tracing.Attribute("container.create.duration", time.Since(start).String()),
|
||||
)
|
||||
return &runtime.CreateContainerResponse{ContainerId: id}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -20,15 +20,18 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
)
|
||||
|
||||
// Exec prepares a streaming endpoint to execute a command in the container, and returns the address.
|
||||
func (c *criService) Exec(ctx context.Context, r *runtime.ExecRequest) (*runtime.ExecResponse, error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
cntr, err := c.containerStore.Get(r.GetContainerId())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to find container %q in store: %w", r.GetContainerId(), err)
|
||||
}
|
||||
span.SetAttributes(tracing.Attribute("container.id", cntr.ID))
|
||||
state := cntr.Status.Get().State()
|
||||
if state != runtime.ContainerState_CONTAINER_RUNNING {
|
||||
return nil, fmt.Errorf("container is in %s state", criContainerStateToString(state))
|
||||
|
||||
@@ -25,6 +25,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/oci"
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
"github.com/containerd/errdefs"
|
||||
"github.com/containerd/log"
|
||||
"k8s.io/client-go/tools/remotecommand"
|
||||
@@ -271,6 +272,7 @@ func (c *criService) execInternal(ctx context.Context, container containerd.Cont
|
||||
// this case, the CRI plugin will still have a goroutine waiting for the exec process
|
||||
// to exit and log the exit code, but dockershim won't.
|
||||
func (c *criService) execInContainer(ctx context.Context, id string, opts execOptions) (*uint32, error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
// Get container from our container store.
|
||||
cntr, err := c.containerStore.Get(id)
|
||||
|
||||
@@ -278,6 +280,7 @@ func (c *criService) execInContainer(ctx context.Context, id string, opts execOp
|
||||
return nil, fmt.Errorf("failed to find container %q in store: %w", id, err)
|
||||
}
|
||||
id = cntr.ID
|
||||
span.SetAttributes(tracing.Attribute("container.id", id))
|
||||
|
||||
state := cntr.Status.Get().State()
|
||||
if state != runtime.ContainerState_CONTAINER_RUNNING {
|
||||
|
||||
@@ -24,6 +24,7 @@ import (
|
||||
|
||||
containerd "github.com/containerd/containerd/v2/client"
|
||||
containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
"github.com/containerd/errdefs"
|
||||
"github.com/containerd/log"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
@@ -31,6 +32,7 @@ import (
|
||||
|
||||
// RemoveContainer removes the container.
|
||||
func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveContainerRequest) (_ *runtime.RemoveContainerResponse, retErr error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
start := time.Now()
|
||||
ctrID := r.GetContainerId()
|
||||
container, err := c.containerStore.Get(ctrID)
|
||||
@@ -43,6 +45,7 @@ func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveConta
|
||||
return &runtime.RemoveContainerResponse{}, nil
|
||||
}
|
||||
id := container.ID
|
||||
span.SetAttributes(tracing.Attribute("container.id", id))
|
||||
i, err := container.Container.Info(ctx)
|
||||
if err != nil {
|
||||
if !errdefs.IsNotFound(err) {
|
||||
@@ -129,6 +132,11 @@ func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveConta
|
||||
|
||||
containerRemoveTimer.WithValues(i.Runtime.Name).UpdateSince(start)
|
||||
|
||||
span.AddEvent("container removed",
|
||||
tracing.Attribute("container.id", container.ID),
|
||||
tracing.Attribute("container.remove.duration", time.Since(start).String()),
|
||||
)
|
||||
|
||||
return &runtime.RemoveContainerResponse{}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ import (
|
||||
"io"
|
||||
"time"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
"github.com/containerd/errdefs"
|
||||
"github.com/containerd/log"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
@@ -38,12 +39,13 @@ import (
|
||||
|
||||
// StartContainer starts the container.
|
||||
func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
start := time.Now()
|
||||
cntr, err := c.containerStore.Get(r.GetContainerId())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("an error occurred when try to find container %q: %w", r.GetContainerId(), err)
|
||||
}
|
||||
|
||||
span.SetAttributes(tracing.Attribute("container.id", cntr.ID))
|
||||
info, err := cntr.Container.Info(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get container info: %w", err)
|
||||
@@ -87,6 +89,7 @@ func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContain
|
||||
if sandbox.Status.Get().State != sandboxstore.StateReady {
|
||||
return nil, fmt.Errorf("sandbox container %q is not running", sandboxID)
|
||||
}
|
||||
span.SetAttributes(tracing.Attribute("sandbox.id", sandboxID))
|
||||
|
||||
// Recheck target container validity in Linux namespace options.
|
||||
if linux := config.GetLinux(); linux != nil {
|
||||
@@ -190,6 +193,10 @@ func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContain
|
||||
|
||||
containerStartTimer.WithValues(info.Runtime.Name).UpdateSince(start)
|
||||
|
||||
span.AddEvent("container started",
|
||||
tracing.Attribute("container.start.duration", time.Since(start).String()),
|
||||
)
|
||||
|
||||
return &runtime.StartContainerResponse{}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ import (
|
||||
containerstore "github.com/containerd/containerd/v2/internal/cri/store/container"
|
||||
ctrdutil "github.com/containerd/containerd/v2/internal/cri/util"
|
||||
"github.com/containerd/containerd/v2/pkg/protobuf"
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
"github.com/containerd/errdefs"
|
||||
"github.com/containerd/log"
|
||||
|
||||
@@ -36,6 +37,7 @@ import (
|
||||
|
||||
// StopContainer stops a running container with a grace period (i.e., timeout).
|
||||
func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
start := time.Now()
|
||||
// Get container config from container store.
|
||||
container, err := c.containerStore.Get(r.GetContainerId())
|
||||
@@ -49,7 +51,7 @@ func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainer
|
||||
// https://github.com/kubernetes/cri-api/blob/c20fa40/pkg/apis/runtime/v1/api.proto#L67-L68
|
||||
return &runtime.StopContainerResponse{}, nil
|
||||
}
|
||||
|
||||
span.SetAttributes(tracing.Attribute("container.id", container.ID))
|
||||
if err := c.stopContainer(ctx, container, time.Duration(r.GetTimeout())*time.Second); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -76,6 +78,8 @@ func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainer
|
||||
|
||||
// stopContainer stops a container based on the container metadata.
|
||||
func (c *criService) stopContainer(ctx context.Context, container containerstore.Container, timeout time.Duration) error {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
start := time.Now()
|
||||
id := container.ID
|
||||
sandboxID := container.SandboxID
|
||||
|
||||
@@ -199,6 +203,12 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
|
||||
if err != nil {
|
||||
return fmt.Errorf("an error occurs during waiting for container %q to be killed: %w", id, err)
|
||||
}
|
||||
|
||||
span.AddEvent("container stopped",
|
||||
tracing.Attribute("container.id", id),
|
||||
tracing.Attribute("container.stop.duration", time.Since(start).String()),
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
"github.com/containerd/errdefs"
|
||||
"github.com/containerd/log"
|
||||
|
||||
@@ -30,6 +31,7 @@ import (
|
||||
// RemovePodSandbox removes the sandbox. If there are running containers in the
|
||||
// sandbox, they should be forcibly removed.
|
||||
func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodSandboxRequest) (*runtime.RemovePodSandboxResponse, error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
start := time.Now()
|
||||
sandbox, err := c.sandboxStore.Get(r.GetPodSandboxId())
|
||||
if err != nil {
|
||||
@@ -44,6 +46,7 @@ func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodS
|
||||
}
|
||||
// Use the full sandbox id.
|
||||
id := sandbox.ID
|
||||
span.SetAttributes(tracing.Attribute("sandbox.id", id))
|
||||
|
||||
// If the sandbox is still running, not ready, or in an unknown state, forcibly stop it.
|
||||
// Even if it's in a NotReady state, this will close its network namespace, if open.
|
||||
@@ -110,5 +113,9 @@ func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodS
|
||||
|
||||
sandboxRemoveTimer.WithValues(sandbox.RuntimeHandler).UpdateSince(start)
|
||||
|
||||
span.AddEvent("pod sandbox removed",
|
||||
tracing.Attribute("sandbox.remove.duration", time.Since(start).String()),
|
||||
)
|
||||
|
||||
return &runtime.RemovePodSandboxResponse{}, nil
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ import (
|
||||
sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox"
|
||||
"github.com/containerd/containerd/v2/internal/cri/util"
|
||||
"github.com/containerd/containerd/v2/pkg/netns"
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
)
|
||||
|
||||
func init() {
|
||||
@@ -49,6 +50,7 @@ func init() {
|
||||
// RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure
|
||||
// the sandbox is in ready state.
|
||||
func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
config := r.GetConfig()
|
||||
log.G(ctx).Debugf("Sandbox config %+v", config)
|
||||
|
||||
@@ -59,6 +61,11 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
return nil, errors.New("sandbox config must include metadata")
|
||||
}
|
||||
name := makeSandboxName(metadata)
|
||||
|
||||
span.SetAttributes(
|
||||
tracing.Attribute("sandbox.id", id),
|
||||
tracing.Attribute("sandbox.name", name),
|
||||
)
|
||||
log.G(ctx).WithField("podsandboxid", id).Debugf("generated id for sandbox name %q", name)
|
||||
|
||||
// cleanupErr records the last error returned by the critical cleanup operations in deferred functions,
|
||||
@@ -172,6 +179,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
//
|
||||
// To simplify this, in the future, we should just remove this case (podNetwork &&
|
||||
// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled).
|
||||
span.AddEvent("setup pod network")
|
||||
netStart := time.Now()
|
||||
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
||||
@@ -356,6 +364,10 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
|
||||
}
|
||||
sandboxCreateNetworkTimer.UpdateSince(netStart)
|
||||
|
||||
span.AddEvent("finished pod network setup",
|
||||
tracing.Attribute("pod.network.setup.duration", time.Since(netStart).String()),
|
||||
)
|
||||
}
|
||||
|
||||
// TODO: get rid of this. sandbox object should no longer have Container field.
|
||||
|
||||
@@ -22,6 +22,7 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/containerd/containerd/v2/pkg/tracing"
|
||||
"github.com/containerd/log"
|
||||
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
||||
|
||||
@@ -32,6 +33,7 @@ import (
|
||||
// StopPodSandbox stops the sandbox. If there are any running containers in the
|
||||
// sandbox, they should be forcibly terminated.
|
||||
func (c *criService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandboxRequest) (*runtime.StopPodSandboxResponse, error) {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
sandbox, err := c.sandboxStore.Get(r.GetPodSandboxId())
|
||||
if err != nil {
|
||||
if !errdefs.IsNotFound(err) {
|
||||
@@ -44,7 +46,7 @@ func (c *criService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb
|
||||
// https://github.com/kubernetes/cri-api/blob/c20fa40/pkg/apis/runtime/v1/api.proto#L45-L46
|
||||
return &runtime.StopPodSandboxResponse{}, nil
|
||||
}
|
||||
|
||||
span.SetAttributes(tracing.Attribute("sandbox.id", sandbox.ID))
|
||||
if err := c.stopPodSandbox(ctx, sandbox); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -53,12 +55,14 @@ func (c *criService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb
|
||||
}
|
||||
|
||||
func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sandbox) error {
|
||||
span := tracing.SpanFromContext(ctx)
|
||||
// Use the full sandbox id.
|
||||
id := sandbox.ID
|
||||
|
||||
// Stop all containers inside the sandbox. This terminates the container forcibly,
|
||||
// and container may still be created, so production should not rely on this behavior.
|
||||
// TODO(random-liu): Introduce a state in sandbox to avoid future container creation.
|
||||
span.AddEvent("stopping containers in the sandbox")
|
||||
stop := time.Now()
|
||||
containers := c.containerStore.List()
|
||||
for _, container := range containers {
|
||||
@@ -87,6 +91,10 @@ func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sa
|
||||
|
||||
sandboxRuntimeStopTimer.WithValues(sandbox.RuntimeHandler).UpdateSince(stop)
|
||||
|
||||
span.AddEvent("sandbox container stopped",
|
||||
tracing.Attribute("sandbox.stop.duration", time.Since(stop).String()),
|
||||
)
|
||||
|
||||
err := c.nri.StopPodSandbox(ctx, &sandbox)
|
||||
if err != nil {
|
||||
log.G(ctx).WithError(err).Errorf("NRI sandbox stop notification failed")
|
||||
@@ -94,6 +102,7 @@ func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sa
|
||||
|
||||
// Teardown network for sandbox.
|
||||
if sandbox.NetNS != nil {
|
||||
span.AddEvent("start pod network teardown")
|
||||
netStop := time.Now()
|
||||
// Use empty netns path if netns is not available. This is defined in:
|
||||
// https://github.com/containernetworking/cni/blob/v0.7.0-alpha1/SPEC.md
|
||||
@@ -109,10 +118,13 @@ func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sa
|
||||
return fmt.Errorf("failed to remove network namespace for sandbox %q: %w", id, err)
|
||||
}
|
||||
sandboxDeleteNetwork.UpdateSince(netStop)
|
||||
|
||||
span.AddEvent("finished pod network teardown",
|
||||
tracing.Attribute("network.teardown.duration", time.Since(netStop).String()),
|
||||
)
|
||||
}
|
||||
|
||||
log.G(ctx).Infof("TearDown network for sandbox %q successfully", id)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user