cri: get pid count from container metrics

This reduces the latency of ListPodSandboxStats() by taking each container's
PID count from its metrics instead of calling the shim Task() API for every
running container.

Signed-off-by: Eric Lin <linxiulei@gmail.com>
This commit is contained in:
Eric Lin 2024-06-21 15:29:32 +00:00
parent 741c4bde51
commit f6e731c809
4 changed files with 73 additions and 59 deletions

View File

@ -49,5 +49,5 @@ func (c *criService) ContainerStats(ctx context.Context, in *runtime.ContainerSt
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to decode container metrics: %w", err) return nil, fmt.Errorf("failed to decode container metrics: %w", err)
} }
return &runtime.ContainerStatsResponse{Stats: cs}, nil return &runtime.ContainerStatsResponse{Stats: cs.stats}, nil
} }

View File

@ -43,6 +43,17 @@ func (c *criService) ListContainerStats(
ctx context.Context, ctx context.Context,
in *runtime.ListContainerStatsRequest, in *runtime.ListContainerStatsRequest,
) (*runtime.ListContainerStatsResponse, error) { ) (*runtime.ListContainerStatsResponse, error) {
css, err := c.listContainerStats(ctx, in)
if err != nil {
return nil, fmt.Errorf("failed to fetch containers and stats: %w", err)
}
return c.toCRIContainerStats(css), nil
}
func (c *criService) listContainerStats(
ctx context.Context,
in *runtime.ListContainerStatsRequest,
) ([]containerStats, error) {
request, containers, err := c.buildTaskMetricsRequest(in) request, containers, err := c.buildTaskMetricsRequest(in)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to build metrics request: %w", err) return nil, fmt.Errorf("failed to build metrics request: %w", err)
@ -51,14 +62,20 @@ func (c *criService) ListContainerStats(
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to fetch metrics for tasks: %w", err) return nil, fmt.Errorf("failed to fetch metrics for tasks: %w", err)
} }
criStats, err := c.toCRIContainerStats(ctx, resp.Metrics, containers) css, err := c.toContainerStats(ctx, resp.Metrics, containers)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to convert to cri containerd stats format: %w", err) return nil, fmt.Errorf("failed to convert to cri containerd stats format: %w", err)
} }
return criStats, nil return css, nil
} }
type metricsHandler func(containerstore.Metadata, *types.Metric) (*runtime.ContainerStats, error) type containerStats struct {
stats *runtime.ContainerStats
// pids is the current PID count taken from the cgroup metrics; it is only populated on Linux (0 on Windows)
pids uint64
}
type metricsHandler func(containerstore.Metadata, *types.Metric) (containerStats, error)
// Returns a function to be used for transforming container metrics into the right format. // Returns a function to be used for transforming container metrics into the right format.
// Uses the platform the given sandbox advertises to implement its logic. If the platform is // Uses the platform the given sandbox advertises to implement its logic. If the platform is
@ -86,11 +103,11 @@ func (c *criService) getMetricsHandler(ctx context.Context, sandboxID string) (m
switch p.OS { switch p.OS {
case "windows": case "windows":
return func(meta containerstore.Metadata, stats *types.Metric) (*runtime.ContainerStats, error) { return func(meta containerstore.Metadata, stats *types.Metric) (containerStats, error) {
return c.windowsContainerMetrics(meta, stats, snapshotter) return c.windowsContainerMetrics(meta, stats, snapshotter)
}, nil }, nil
case "linux": case "linux":
return func(meta containerstore.Metadata, stats *types.Metric) (*runtime.ContainerStats, error) { return func(meta containerstore.Metadata, stats *types.Metric) (containerStats, error) {
return c.linuxContainerMetrics(meta, stats, snapshotter) return c.linuxContainerMetrics(meta, stats, snapshotter)
}, nil }, nil
default: default:
@ -98,16 +115,16 @@ func (c *criService) getMetricsHandler(ctx context.Context, sandboxID string) (m
} }
} }
func (c *criService) toCRIContainerStats( func (c *criService) toContainerStats(
ctx context.Context, ctx context.Context,
stats []*types.Metric, stats []*types.Metric,
containers []containerstore.Container, containers []containerstore.Container,
) (*runtime.ListContainerStatsResponse, error) { ) ([]containerStats, error) {
statsMap := make(map[string]*types.Metric) statsMap := make(map[string]*types.Metric)
for _, stat := range stats { for _, stat := range stats {
statsMap[stat.ID] = stat statsMap[stat.ID] = stat
} }
containerStats := new(runtime.ListContainerStatsResponse) css := []containerStats{}
// Unfortunately if no filter was passed we're asking for every containers stats which // Unfortunately if no filter was passed we're asking for every containers stats which
// generally belong to multiple different pods, who all might have different platforms. // generally belong to multiple different pods, who all might have different platforms.
@ -143,17 +160,25 @@ func (c *criService) toCRIContainerStats(
return nil, fmt.Errorf("failed to decode container metrics for %q: %w", cntr.ID, err) return nil, fmt.Errorf("failed to decode container metrics for %q: %w", cntr.ID, err)
} }
if cs.Cpu != nil && cs.Cpu.UsageCoreNanoSeconds != nil { if cs.stats.Cpu != nil && cs.stats.Cpu.UsageCoreNanoSeconds != nil {
// this is a calculated value and should be computed for all OSes // this is a calculated value and should be computed for all OSes
nanoUsage, err := c.getUsageNanoCores(cntr.Metadata.ID, false, cs.Cpu.UsageCoreNanoSeconds.Value, time.Unix(0, cs.Cpu.Timestamp)) nanoUsage, err := c.getUsageNanoCores(cntr.Metadata.ID, false, cs.stats.Cpu.UsageCoreNanoSeconds.Value, time.Unix(0, cs.stats.Cpu.Timestamp))
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get usage nano cores, containerID: %s: %w", cntr.Metadata.ID, err) return nil, fmt.Errorf("failed to get usage nano cores, containerID: %s: %w", cntr.Metadata.ID, err)
} }
cs.Cpu.UsageNanoCores = &runtime.UInt64Value{Value: nanoUsage} cs.stats.Cpu.UsageNanoCores = &runtime.UInt64Value{Value: nanoUsage}
} }
containerStats.Stats = append(containerStats.Stats, cs) css = append(css, cs)
} }
return containerStats, nil return css, nil
}
func (c *criService) toCRIContainerStats(css []containerStats) *runtime.ListContainerStatsResponse {
containerStats := new(runtime.ListContainerStatsResponse)
for _, cs := range css {
containerStats.Stats = append(containerStats.Stats, cs.stats)
}
return containerStats
} }
func (c *criService) getUsageNanoCores(containerID string, isSandbox bool, currentUsageCoreNanoSeconds uint64, currentTimestamp time.Time) (uint64, error) { func (c *criService) getUsageNanoCores(containerID string, isSandbox bool, currentUsageCoreNanoSeconds uint64, currentTimestamp time.Time) (uint64, error) {
@ -275,7 +300,7 @@ func (c *criService) windowsContainerMetrics(
meta containerstore.Metadata, meta containerstore.Metadata,
stats *types.Metric, stats *types.Metric,
snapshotter string, snapshotter string,
) (*runtime.ContainerStats, error) { ) (containerStats, error) {
var cs runtime.ContainerStats var cs runtime.ContainerStats
var usedBytes, inodesUsed uint64 var usedBytes, inodesUsed uint64
sn, err := c.GetSnapshot(meta.ID, snapshotter) sn, err := c.GetSnapshot(meta.ID, snapshotter)
@ -303,11 +328,11 @@ func (c *criService) windowsContainerMetrics(
if stats != nil { if stats != nil {
s, err := typeurl.UnmarshalAny(stats.Data) s, err := typeurl.UnmarshalAny(stats.Data)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to extract container metrics: %w", err) return containerStats{}, fmt.Errorf("failed to extract container metrics: %w", err)
} }
wstats := s.(*wstats.Statistics).GetWindows() wstats := s.(*wstats.Statistics).GetWindows()
if wstats == nil { if wstats == nil {
return nil, errors.New("windows stats is empty") return containerStats{}, errors.New("windows stats is empty")
} }
if wstats.Processor != nil { if wstats.Processor != nil {
cs.Cpu = &runtime.CpuUsage{ cs.Cpu = &runtime.CpuUsage{
@ -324,16 +349,16 @@ func (c *criService) windowsContainerMetrics(
} }
} }
} }
return &cs, nil return containerStats{&cs, 0}, nil
} }
func (c *criService) linuxContainerMetrics( func (c *criService) linuxContainerMetrics(
meta containerstore.Metadata, meta containerstore.Metadata,
stats *types.Metric, stats *types.Metric,
snapshotter string, snapshotter string,
) (*runtime.ContainerStats, error) { ) (containerStats, error) {
var cs runtime.ContainerStats var cs runtime.ContainerStats
var usedBytes, inodesUsed uint64 var usedBytes, inodesUsed, pids uint64
sn, err := c.GetSnapshot(meta.ID, snapshotter) sn, err := c.GetSnapshot(meta.ID, snapshotter)
// If snapshotstore doesn't have cached snapshot information // If snapshotstore doesn't have cached snapshot information
// set WritableLayer usage to zero // set WritableLayer usage to zero
@ -361,32 +386,37 @@ func (c *criService) linuxContainerMetrics(
switch { switch {
case typeurl.Is(stats.Data, (*cg1.Metrics)(nil)): case typeurl.Is(stats.Data, (*cg1.Metrics)(nil)):
data = &cg1.Metrics{} data = &cg1.Metrics{}
if err := typeurl.UnmarshalTo(stats.Data, data); err != nil {
return containerStats{}, fmt.Errorf("failed to extract container metrics: %w", err)
}
pids = data.(*cg1.Metrics).GetPids().GetCurrent()
case typeurl.Is(stats.Data, (*cg2.Metrics)(nil)): case typeurl.Is(stats.Data, (*cg2.Metrics)(nil)):
data = &cg2.Metrics{} data = &cg2.Metrics{}
case typeurl.Is(stats.Data, (*wstats.Statistics)(nil)): if err := typeurl.UnmarshalTo(stats.Data, data); err != nil {
data = &wstats.Statistics{} return containerStats{}, fmt.Errorf("failed to extract container metrics: %w", err)
}
pids = data.(*cg2.Metrics).GetPids().GetCurrent()
default: default:
return nil, errors.New("cannot convert metric data to cgroups.Metrics or windows.Statistics") return containerStats{}, errors.New("cannot convert metric data to cgroups.Metrics")
}
if err := typeurl.UnmarshalTo(stats.Data, data); err != nil {
return nil, fmt.Errorf("failed to extract container metrics: %w", err)
} }
cpuStats, err := c.cpuContainerStats(meta.ID, false /* isSandbox */, data, protobuf.FromTimestamp(stats.Timestamp)) cpuStats, err := c.cpuContainerStats(meta.ID, false /* isSandbox */, data, protobuf.FromTimestamp(stats.Timestamp))
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to obtain cpu stats: %w", err) return containerStats{}, fmt.Errorf("failed to obtain cpu stats: %w", err)
} }
cs.Cpu = cpuStats cs.Cpu = cpuStats
memoryStats, err := c.memoryContainerStats(meta.ID, data, protobuf.FromTimestamp(stats.Timestamp)) memoryStats, err := c.memoryContainerStats(meta.ID, data, protobuf.FromTimestamp(stats.Timestamp))
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to obtain memory stats: %w", err) return containerStats{}, fmt.Errorf("failed to obtain memory stats: %w", err)
} }
cs.Memory = memoryStats cs.Memory = memoryStats
if err != nil {
return containerStats{}, fmt.Errorf("failed to obtain pid count: %w", err)
}
} }
return &cs, nil return containerStats{&cs, pids}, nil
} }
// getWorkingSet calculates workingset memory from cgroup memory stats. // getWorkingSet calculates workingset memory from cgroup memory stats.

View File

@ -420,7 +420,7 @@ func TestListContainerStats(t *testing.T) {
if tt.before != nil { if tt.before != nil {
tt.before() tt.before()
} }
got, err := c.toCRIContainerStats(tt.args.ctx, tt.args.stats, tt.args.containers) css, err := c.toContainerStats(tt.args.ctx, tt.args.stats, tt.args.containers)
if tt.after != nil { if tt.after != nil {
tt.after() tt.after()
} }
@ -428,6 +428,10 @@ func TestListContainerStats(t *testing.T) {
t.Errorf("ListContainerStats() error = %v, wantErr %v", err, tt.wantErr) t.Errorf("ListContainerStats() error = %v, wantErr %v", err, tt.wantErr)
return return
} }
var got *runtime.ListContainerStatsResponse
if err == nil {
got = c.toCRIContainerStats(css)
}
if !reflect.DeepEqual(got, tt.want) { if !reflect.DeepEqual(got, tt.want) {
t.Errorf("ListContainerStats() = %v, want %v", got, tt.want) t.Errorf("ListContainerStats() = %v, want %v", got, tt.want)
} }

View File

@ -84,40 +84,20 @@ func (c *criService) podSandboxStats(
} }
} }
listContainerStatsRequest := &runtime.ListContainerStatsRequest{Filter: &runtime.ContainerStatsFilter{PodSandboxId: meta.ID}}
css, err := c.listContainerStats(ctx, listContainerStatsRequest)
if err != nil {
return nil, fmt.Errorf("failed to obtain container stats during podSandboxStats call: %w", err)
}
var pidCount uint64 var pidCount uint64
for _, cntr := range c.containerStore.List() { for _, cs := range css {
if cntr.SandboxID != sandbox.ID { pidCount += cs.pids
continue podSandboxStats.Linux.Containers = append(podSandboxStats.Linux.Containers, cs.stats)
}
state := cntr.Status.Get().State()
if state != runtime.ContainerState_CONTAINER_RUNNING {
continue
}
task, err := cntr.Container.Task(ctx, nil)
if err != nil {
return nil, err
}
processes, err := task.Pids(ctx)
if err != nil {
return nil, err
}
pidCount += uint64(len(processes))
} }
podSandboxStats.Linux.Process = &runtime.ProcessUsage{ podSandboxStats.Linux.Process = &runtime.ProcessUsage{
Timestamp: timestamp.UnixNano(), Timestamp: timestamp.UnixNano(),
ProcessCount: &runtime.UInt64Value{Value: pidCount}, ProcessCount: &runtime.UInt64Value{Value: pidCount},
} }
listContainerStatsRequest := &runtime.ListContainerStatsRequest{Filter: &runtime.ContainerStatsFilter{PodSandboxId: meta.ID}}
resp, err := c.ListContainerStats(ctx, listContainerStatsRequest)
if err != nil {
return nil, fmt.Errorf("failed to obtain container stats during podSandboxStats call: %w", err)
}
podSandboxStats.Linux.Containers = resp.GetStats()
} }
return podSandboxStats, nil return podSandboxStats, nil