metrics/cgroups: fix deadlock issue in Add during Collect

Collector.Collect is the callback of the ns field's Collect and is
invoked periodically while ns holds its internal lock. Collector.Add
also takes ns.Lock while holding Collector.Lock, which can easily
cause a deadlock.

Goroutine X:

	ns.Collect
	  ns.Lock
	    Collector.Collect
	      Collector.RLock

Goroutine Y:

	Collector.Add
	  Collector.Lock
	    ns.Lock

We should therefore take ns.Lock in Add without holding Collector.Lock.
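
A minimal, self-contained sketch of this lock-order inversion
(illustrative names only, not the real containerd/go-metrics types);
run it and the Go runtime will almost always abort with "all
goroutines are asleep - deadlock!":

	package main

	import "sync"

	var (
		nsMu        sync.Mutex   // models the namespace's internal lock (ns.Lock)
		collectorMu sync.RWMutex // models Collector.mu
	)

	// collect models Goroutine X: ns.Collect -> ns.Lock -> Collector.Collect -> Collector.RLock.
	func collect() {
		nsMu.Lock()
		defer nsMu.Unlock()
		collectorMu.RLock()
		defer collectorMu.RUnlock()
	}

	// add models Goroutine Y with the old locking: Collector.Lock -> ns.Lock.
	func add() {
		collectorMu.Lock()
		defer collectorMu.Unlock()
		nsMu.Lock()
		defer nsMu.Unlock()
	}

	func main() {
		var wg sync.WaitGroup
		for i := 0; i < 1000; i++ {
			wg.Add(2)
			go func() { defer wg.Done(); collect() }()
			go func() { defer wg.Done(); add() }()
		}
		wg.Wait() // wedges once X holds nsMu wanting collectorMu while Y holds the reverse
	}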

Fix: #6772

Signed-off-by: Wei Fu <fuweid89@gmail.com>
@@ -25,21 +25,14 @@ import (
 	"sync"
 
 	"github.com/containerd/containerd/log"
+	"github.com/containerd/containerd/metrics/cgroups/common"
 	v2 "github.com/containerd/containerd/metrics/types/v2"
 	"github.com/containerd/containerd/namespaces"
 	"github.com/containerd/typeurl"
 	"github.com/docker/go-metrics"
-	"github.com/gogo/protobuf/types"
 	"github.com/prometheus/client_golang/prometheus"
 )
 
-// Statable type that returns cgroup metrics
-type Statable interface {
-	ID() string
-	Namespace() string
-	Stats(context.Context) (*types.Any, error)
-}
-
 // NewCollector registers the collector with the provided namespace and returns it so
 // that cgroups can be added for collection
 func NewCollector(ns *metrics.Namespace) *Collector {
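
The Statable interface removed above now lives in the new
metrics/cgroups/common package imported by this hunk. A sketch of the
relocated definition, assuming the interface moved over verbatim:

// metrics/cgroups/common (sketch; assumes a verbatim move)
package common

import (
	"context"

	"github.com/gogo/protobuf/types"
)

// Statable type that returns cgroup metrics
type Statable interface {
	ID() string
	Namespace() string
	Stats(context.Context) (*types.Any, error)
}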
@@ -64,7 +57,7 @@ func taskID(id, namespace string) string {
 }
 
 type entry struct {
-	task Statable
+	task common.Statable
 	// ns is an optional child namespace that carries labels in addition to the parent's.
 	// This can be used to append task-specific labels, to differentiate the different containerd metrics.
 	ns *metrics.Namespace
@@ -73,12 +66,34 @@ type entry struct {
 // Collector provides the ability to collect container stats and export
 // them in the prometheus format
 type Collector struct {
-	mu sync.RWMutex
-
-	tasks         map[string]entry
 	ns            *metrics.Namespace
-	metrics       []*metric
 	storedMetrics chan prometheus.Metric
+
+	// TODO(fuweid):
+	//
+	// Collector.Collect is the callback of the ns field's Collect and
+	// is invoked periodically while ns holds its internal lock.
+	// Collector.Add may also take ns.Lock if labels is not nil, which
+	// can easily cause a deadlock:
+	//
+	// Goroutine X:
+	//
+	//	ns.Collect
+	//	  ns.Lock
+	//	    Collector.Collect
+	//	      Collector.RLock
+	//
+	//
+	// Goroutine Y:
+	//
+	//	Collector.Add
+	//	  ...(RLock/Lock)
+	//	    ns.Lock
+	//
+	// I think we should find a way to decouple ns from Collector.
+	mu      sync.RWMutex
+	tasks   map[string]entry
+	metrics []*metric
 }
 
 // Describe prometheus metrics
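
To make the TODO concrete: Collector.Collect only runs because the
collector registers itself with ns, and the namespace wrapper invokes
every child collector while holding its own mutex. A simplified model
of that scrape side (an illustration, not the actual docker/go-metrics
Namespace):

package model

import (
	"sync"

	"github.com/prometheus/client_golang/prometheus"
)

// namespace is a stand-in for a go-metrics style wrapper: it implements
// prometheus.Collector and fans out to children under its own mutex, so
// every child Collect runs inside the equivalent of ns.Lock.
type namespace struct {
	mu       sync.Mutex
	children []prometheus.Collector
}

func (n *namespace) Describe(ch chan<- *prometheus.Desc) {
	n.mu.Lock()
	defer n.mu.Unlock()
	for _, c := range n.children {
		c.Describe(ch)
	}
}

func (n *namespace) Collect(ch chan<- prometheus.Metric) {
	n.mu.Lock() // "ns.Lock" in the trace above
	defer n.mu.Unlock()
	for _, c := range n.children {
		c.Collect(ch) // reaches Collector.Collect, which takes Collector.RLock
	}
}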
@@ -141,26 +156,29 @@ func (c *Collector) collect(entry entry, ch chan<- prometheus.Metric, block bool
 }
 
 // Add adds the provided cgroup and id so that metrics are collected and exported
-func (c *Collector) Add(t Statable, labels map[string]string) error {
+func (c *Collector) Add(t common.Statable, labels map[string]string) error {
 	if c.ns == nil {
 		return nil
 	}
-	c.mu.Lock()
-	defer c.mu.Unlock()
+	c.mu.RLock()
 	id := taskID(t.ID(), t.Namespace())
-	if _, ok := c.tasks[id]; ok {
+	_, ok := c.tasks[id]
+	c.mu.RUnlock()
+	if ok {
 		return nil // requests to collect metrics should be idempotent
 	}
 	entry := entry{task: t}
 	if labels != nil {
 		entry.ns = c.ns.WithConstLabels(labels)
 	}
+	c.mu.Lock()
 	c.tasks[id] = entry
+	c.mu.Unlock()
 	return nil
 }
 
 // Remove removes the provided cgroup by id from the collector
-func (c *Collector) Remove(t Statable) {
+func (c *Collector) Remove(t common.Statable) {
 	if c.ns == nil {
 		return
 	}
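
The new Add takes the read lock only for the membership check, builds
the entry (which may take ns.Lock via WithConstLabels) with no
Collector lock held, and takes the write lock only for the map insert.
Two racing Adds may both pass the check; the second write simply
overwrites an equivalent entry, which is safe because Add is documented
as idempotent. A hedged sketch of a regression test for the original
deadlock (mockStatable, the iteration counts, and the timeout are
assumptions of this sketch, not necessarily the tests added by this
commit):

package v2

import (
	"context"
	"errors"
	"strconv"
	"sync"
	"testing"
	"time"

	metrics "github.com/docker/go-metrics"
	"github.com/gogo/protobuf/types"
	"github.com/prometheus/client_golang/prometheus"
)

// mockStatable is a hypothetical stand-in satisfying common.Statable.
type mockStatable struct{ id, ns string }

func (m mockStatable) ID() string        { return m.id }
func (m mockStatable) Namespace() string { return m.ns }
func (m mockStatable) Stats(context.Context) (*types.Any, error) {
	return nil, errors.New("mock: no stats")
}

func TestAddDoesNotDeadlockWithCollect(t *testing.T) {
	ns := metrics.NewNamespace("test", "cgroups", nil)
	c := NewCollector(ns) // NewCollector registers c with ns

	var wg sync.WaitGroup
	wg.Add(2)
	go func() { // scrape path: ns.Collect -> ns.Lock -> Collector.RLock
		defer wg.Done()
		for i := 0; i < 100; i++ {
			ch := make(chan prometheus.Metric, 1024)
			drained := make(chan struct{})
			go func() { // drain so Collect never blocks on the channel
				for range ch {
				}
				close(drained)
			}()
			ns.Collect(ch)
			close(ch)
			<-drained
		}
	}()
	go func() { // Add path: non-nil labels force ns.WithConstLabels -> ns.Lock
		defer wg.Done()
		for i := 0; i < 100; i++ {
			_ = c.Add(mockStatable{id: strconv.Itoa(i), ns: "default"},
				map[string]string{"k": "v"})
		}
	}()

	finished := make(chan struct{})
	go func() { wg.Wait(); close(finished) }()
	select {
	case <-finished:
	case <-time.After(30 * time.Second):
		t.Fatal("deadlock between Collector.Add and Collector.Collect")
	}
}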