diff --git a/Vagrantfile b/Vagrantfile index fe7aaca2b..f1a2027bf 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -228,6 +228,7 @@ EOF set -eux -o pipefail rm -rf /var/lib/containerd-test /run/containerd-test cd ${GOPATH}/src/github.com/containerd/containerd + go test -v -count=1 -race ./metrics/cgroups make integration EXTRA_TESTFLAGS="-timeout 15m -no-criu -test.v" TEST_RUNTIME=io.containerd.runc.v2 RUNC_FLAVOR=$RUNC_FLAVOR SHELL end diff --git a/metrics/cgroups/common/type.go b/metrics/cgroups/common/type.go new file mode 100644 index 000000000..b19235650 --- /dev/null +++ b/metrics/cgroups/common/type.go @@ -0,0 +1,33 @@ +//go:build linux +// +build linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package common + +import ( + "context" + + "github.com/gogo/protobuf/types" +) + +// Statable type that returns cgroup metrics +type Statable interface { + ID() string + Namespace() string + Stats(context.Context) (*types.Any, error) +} diff --git a/metrics/cgroups/metrics_test.go b/metrics/cgroups/metrics_test.go new file mode 100644 index 000000000..c362ea3b9 --- /dev/null +++ b/metrics/cgroups/metrics_test.go @@ -0,0 +1,158 @@ +//go:build linux +// +build linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "context" + "strconv" + "sync" + "testing" + "time" + + "github.com/containerd/cgroups" + "github.com/containerd/containerd/metrics/cgroups/common" + v1 "github.com/containerd/containerd/metrics/cgroups/v1" + v2 "github.com/containerd/containerd/metrics/cgroups/v2" + v1types "github.com/containerd/containerd/metrics/types/v1" + v2types "github.com/containerd/containerd/metrics/types/v2" + "github.com/containerd/containerd/protobuf" + "github.com/prometheus/client_golang/prometheus" + + metrics "github.com/docker/go-metrics" + "github.com/gogo/protobuf/types" +) + +// TestRegressionIssue6772 should not have dead-lock when Collect and Add run +// in the same time. +// +// Issue: https://github.com/containerd/containerd/issues/6772. +func TestRegressionIssue6772(t *testing.T) { + ns := metrics.NewNamespace("test-container", "", nil) + isV1 := true + + var collecter Collecter + if cgroups.Mode() == cgroups.Unified { + isV1 = false + collecter = v2.NewCollector(ns) + } else { + collecter = v1.NewCollector(ns) + } + + doneCh := make(chan struct{}) + defer close(doneCh) + + maxItem := 100 + startCh := make(chan struct{}) + + metricCh := make(chan prometheus.Metric, maxItem) + + go func() { + for { + select { + case <-doneCh: + return + case <-metricCh: + } + } + }() + + go func() { + // pulling the metrics to trigger dead-lock + ns.Collect(metricCh) + close(startCh) + + for { + select { + case <-doneCh: + return + default: + } + + ns.Collect(metricCh) + } + }() + <-startCh + + labels := map[string]string{"issue": "6772"} + errCh := make(chan error, 1) + + var wg sync.WaitGroup + for i := 0; i < maxItem; i++ { + id := i + wg.Add(1) + + go func() { + defer wg.Done() + + err := collecter.Add( + &mockStatT{ + id: strconv.Itoa(id), + namespace: "issue6772", + isV1: isV1, + }, + labels, + ) + if err != nil { + errCh <- err + } + }() + } + + finishedCh := make(chan struct{}) + go func() { + defer close(finishedCh) + + wg.Wait() + }() + + select { + case err := <-errCh: + t.Fatalf("unexpected error: %v", err) + case <-finishedCh: + case <-time.After(30 * time.Second): + t.Fatal("should finish the Add in time") + } +} + +type Collecter interface { + Collect(ch chan<- prometheus.Metric) + + Add(t common.Statable, labels map[string]string) error +} + +type mockStatT struct { + id, namespace string + isV1 bool +} + +func (t *mockStatT) ID() string { + return t.id +} + +func (t *mockStatT) Namespace() string { + return t.namespace +} + +func (t *mockStatT) Stats(context.Context) (*types.Any, error) { + if t.isV1 { + return protobuf.MarshalAnyToProto(&v1types.Metrics{}) + } + return protobuf.MarshalAnyToProto(&v2types.Metrics{}) +} diff --git a/metrics/cgroups/v1/metrics.go b/metrics/cgroups/v1/metrics.go index a620bdddd..f73b0608a 100644 --- a/metrics/cgroups/v1/metrics.go +++ b/metrics/cgroups/v1/metrics.go @@ -26,21 +26,14 @@ import ( "github.com/containerd/cgroups" "github.com/containerd/containerd/log" + "github.com/containerd/containerd/metrics/cgroups/common" v1 "github.com/containerd/containerd/metrics/types/v1" "github.com/containerd/containerd/namespaces" "github.com/containerd/typeurl" "github.com/docker/go-metrics" - "github.com/gogo/protobuf/types" "github.com/prometheus/client_golang/prometheus" ) -// Statable type that returns cgroup metrics -type Statable interface { - ID() string - Namespace() string - Stats(context.Context) (*types.Any, error) -} - // Trigger will be called when an event happens and provides the cgroup // where the event originated from type Trigger func(string, string, cgroups.Cgroup) @@ -71,7 +64,7 @@ func taskID(id, namespace string) string { } type entry struct { - task Statable + task common.Statable // ns is an optional child namespace that contains additional to parent labels. // This can be used to append task specific labels to be able to differentiate the different containerd metrics. ns *metrics.Namespace @@ -80,12 +73,34 @@ type entry struct { // Collector provides the ability to collect container stats and export // them in the prometheus format type Collector struct { - mu sync.RWMutex - - tasks map[string]entry ns *metrics.Namespace - metrics []*metric storedMetrics chan prometheus.Metric + + // TODO(fuweid): + // + // The Collector.Collect will be the field ns'Collect's callback, + // which be invoked periodically with internal lock. And Collector.Add + // might also invoke ns.Lock if the labels is not nil, which is easy to + // cause dead-lock. + // + // Goroutine X: + // + // ns.Collect + // ns.Lock + // Collector.Collect + // Collector.RLock + // + // + // Goroutine Y: + // + // Collector.Add + // ...(RLock/Lock) + // ns.Lock + // + // I think we should seek the way to decouple ns from Collector. + mu sync.RWMutex + tasks map[string]entry + metrics []*metric } // Describe prometheus metrics @@ -148,26 +163,31 @@ func (c *Collector) collect(entry entry, ch chan<- prometheus.Metric, block bool } // Add adds the provided cgroup and id so that metrics are collected and exported -func (c *Collector) Add(t Statable, labels map[string]string) error { +func (c *Collector) Add(t common.Statable, labels map[string]string) error { if c.ns == nil { return nil } - c.mu.Lock() - defer c.mu.Unlock() + c.mu.RLock() id := taskID(t.ID(), t.Namespace()) - if _, ok := c.tasks[id]; ok { + _, ok := c.tasks[id] + c.mu.RUnlock() + if ok { return nil // requests to collect metrics should be idempotent } + entry := entry{task: t} if labels != nil { entry.ns = c.ns.WithConstLabels(labels) } + + c.mu.Lock() c.tasks[id] = entry + c.mu.Unlock() return nil } // Remove removes the provided cgroup by id from the collector -func (c *Collector) Remove(t Statable) { +func (c *Collector) Remove(t common.Statable) { if c.ns == nil { return } diff --git a/metrics/cgroups/v2/metrics.go b/metrics/cgroups/v2/metrics.go index 61cdb2cc8..00c498c83 100644 --- a/metrics/cgroups/v2/metrics.go +++ b/metrics/cgroups/v2/metrics.go @@ -25,21 +25,14 @@ import ( "sync" "github.com/containerd/containerd/log" + "github.com/containerd/containerd/metrics/cgroups/common" v2 "github.com/containerd/containerd/metrics/types/v2" "github.com/containerd/containerd/namespaces" "github.com/containerd/typeurl" "github.com/docker/go-metrics" - "github.com/gogo/protobuf/types" "github.com/prometheus/client_golang/prometheus" ) -// Statable type that returns cgroup metrics -type Statable interface { - ID() string - Namespace() string - Stats(context.Context) (*types.Any, error) -} - // NewCollector registers the collector with the provided namespace and returns it so // that cgroups can be added for collection func NewCollector(ns *metrics.Namespace) *Collector { @@ -64,7 +57,7 @@ func taskID(id, namespace string) string { } type entry struct { - task Statable + task common.Statable // ns is an optional child namespace that contains additional to parent labels. // This can be used to append task specific labels to be able to differentiate the different containerd metrics. ns *metrics.Namespace @@ -73,12 +66,34 @@ type entry struct { // Collector provides the ability to collect container stats and export // them in the prometheus format type Collector struct { - mu sync.RWMutex - - tasks map[string]entry ns *metrics.Namespace - metrics []*metric storedMetrics chan prometheus.Metric + + // TODO(fuweid): + // + // The Collector.Collect will be the field ns'Collect's callback, + // which be invoked periodically with internal lock. And Collector.Add + // might also invoke ns.Lock if the labels is not nil, which is easy to + // cause dead-lock. + // + // Goroutine X: + // + // ns.Collect + // ns.Lock + // Collector.Collect + // Collector.RLock + // + // + // Goroutine Y: + // + // Collector.Add + // ...(RLock/Lock) + // ns.Lock + // + // I think we should seek the way to decouple ns from Collector. + mu sync.RWMutex + tasks map[string]entry + metrics []*metric } // Describe prometheus metrics @@ -141,26 +156,29 @@ func (c *Collector) collect(entry entry, ch chan<- prometheus.Metric, block bool } // Add adds the provided cgroup and id so that metrics are collected and exported -func (c *Collector) Add(t Statable, labels map[string]string) error { +func (c *Collector) Add(t common.Statable, labels map[string]string) error { if c.ns == nil { return nil } - c.mu.Lock() - defer c.mu.Unlock() + c.mu.RLock() id := taskID(t.ID(), t.Namespace()) - if _, ok := c.tasks[id]; ok { + _, ok := c.tasks[id] + c.mu.RUnlock() + if ok { return nil // requests to collect metrics should be idempotent } entry := entry{task: t} if labels != nil { entry.ns = c.ns.WithConstLabels(labels) } + c.mu.Lock() c.tasks[id] = entry + c.mu.Unlock() return nil } // Remove removes the provided cgroup by id from the collector -func (c *Collector) Remove(t Statable) { +func (c *Collector) Remove(t common.Statable) { if c.ns == nil { return }