Merge pull request #6788 from fuweid/fix-issue-6772
metrics/cgroups: fix deadlock issue in Add during Collect
This commit is contained in:
commit
449eb08b89
1
Vagrantfile
vendored
1
Vagrantfile
vendored
@ -228,6 +228,7 @@ EOF
|
|||||||
set -eux -o pipefail
|
set -eux -o pipefail
|
||||||
rm -rf /var/lib/containerd-test /run/containerd-test
|
rm -rf /var/lib/containerd-test /run/containerd-test
|
||||||
cd ${GOPATH}/src/github.com/containerd/containerd
|
cd ${GOPATH}/src/github.com/containerd/containerd
|
||||||
|
go test -v -count=1 -race ./metrics/cgroups
|
||||||
make integration EXTRA_TESTFLAGS="-timeout 15m -no-criu -test.v" TEST_RUNTIME=io.containerd.runc.v2 RUNC_FLAVOR=$RUNC_FLAVOR
|
make integration EXTRA_TESTFLAGS="-timeout 15m -no-criu -test.v" TEST_RUNTIME=io.containerd.runc.v2 RUNC_FLAVOR=$RUNC_FLAVOR
|
||||||
SHELL
|
SHELL
|
||||||
end
|
end
|
||||||
|
33
metrics/cgroups/common/type.go
Normal file
33
metrics/cgroups/common/type.go
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
//go:build linux
|
||||||
|
// +build linux
|
||||||
|
|
||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package common
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/gogo/protobuf/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Statable type that returns cgroup metrics
|
||||||
|
type Statable interface {
|
||||||
|
ID() string
|
||||||
|
Namespace() string
|
||||||
|
Stats(context.Context) (*types.Any, error)
|
||||||
|
}
|
158
metrics/cgroups/metrics_test.go
Normal file
158
metrics/cgroups/metrics_test.go
Normal file
@ -0,0 +1,158 @@
|
|||||||
|
//go:build linux
|
||||||
|
// +build linux
|
||||||
|
|
||||||
|
/*
|
||||||
|
Copyright The containerd Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package cgroups
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strconv"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/containerd/cgroups"
|
||||||
|
"github.com/containerd/containerd/metrics/cgroups/common"
|
||||||
|
v1 "github.com/containerd/containerd/metrics/cgroups/v1"
|
||||||
|
v2 "github.com/containerd/containerd/metrics/cgroups/v2"
|
||||||
|
v1types "github.com/containerd/containerd/metrics/types/v1"
|
||||||
|
v2types "github.com/containerd/containerd/metrics/types/v2"
|
||||||
|
"github.com/containerd/containerd/protobuf"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
|
||||||
|
metrics "github.com/docker/go-metrics"
|
||||||
|
"github.com/gogo/protobuf/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestRegressionIssue6772 should not have dead-lock when Collect and Add run
|
||||||
|
// in the same time.
|
||||||
|
//
|
||||||
|
// Issue: https://github.com/containerd/containerd/issues/6772.
|
||||||
|
func TestRegressionIssue6772(t *testing.T) {
|
||||||
|
ns := metrics.NewNamespace("test-container", "", nil)
|
||||||
|
isV1 := true
|
||||||
|
|
||||||
|
var collecter Collecter
|
||||||
|
if cgroups.Mode() == cgroups.Unified {
|
||||||
|
isV1 = false
|
||||||
|
collecter = v2.NewCollector(ns)
|
||||||
|
} else {
|
||||||
|
collecter = v1.NewCollector(ns)
|
||||||
|
}
|
||||||
|
|
||||||
|
doneCh := make(chan struct{})
|
||||||
|
defer close(doneCh)
|
||||||
|
|
||||||
|
maxItem := 100
|
||||||
|
startCh := make(chan struct{})
|
||||||
|
|
||||||
|
metricCh := make(chan prometheus.Metric, maxItem)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-doneCh:
|
||||||
|
return
|
||||||
|
case <-metricCh:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
// pulling the metrics to trigger dead-lock
|
||||||
|
ns.Collect(metricCh)
|
||||||
|
close(startCh)
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-doneCh:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
ns.Collect(metricCh)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
<-startCh
|
||||||
|
|
||||||
|
labels := map[string]string{"issue": "6772"}
|
||||||
|
errCh := make(chan error, 1)
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
for i := 0; i < maxItem; i++ {
|
||||||
|
id := i
|
||||||
|
wg.Add(1)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
|
||||||
|
err := collecter.Add(
|
||||||
|
&mockStatT{
|
||||||
|
id: strconv.Itoa(id),
|
||||||
|
namespace: "issue6772",
|
||||||
|
isV1: isV1,
|
||||||
|
},
|
||||||
|
labels,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
errCh <- err
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
finishedCh := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
defer close(finishedCh)
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
|
}()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case err := <-errCh:
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
case <-finishedCh:
|
||||||
|
case <-time.After(30 * time.Second):
|
||||||
|
t.Fatal("should finish the Add in time")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type Collecter interface {
|
||||||
|
Collect(ch chan<- prometheus.Metric)
|
||||||
|
|
||||||
|
Add(t common.Statable, labels map[string]string) error
|
||||||
|
}
|
||||||
|
|
||||||
|
type mockStatT struct {
|
||||||
|
id, namespace string
|
||||||
|
isV1 bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *mockStatT) ID() string {
|
||||||
|
return t.id
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *mockStatT) Namespace() string {
|
||||||
|
return t.namespace
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *mockStatT) Stats(context.Context) (*types.Any, error) {
|
||||||
|
if t.isV1 {
|
||||||
|
return protobuf.MarshalAnyToProto(&v1types.Metrics{})
|
||||||
|
}
|
||||||
|
return protobuf.MarshalAnyToProto(&v2types.Metrics{})
|
||||||
|
}
|
@ -26,21 +26,14 @@ import (
|
|||||||
|
|
||||||
"github.com/containerd/cgroups"
|
"github.com/containerd/cgroups"
|
||||||
"github.com/containerd/containerd/log"
|
"github.com/containerd/containerd/log"
|
||||||
|
"github.com/containerd/containerd/metrics/cgroups/common"
|
||||||
v1 "github.com/containerd/containerd/metrics/types/v1"
|
v1 "github.com/containerd/containerd/metrics/types/v1"
|
||||||
"github.com/containerd/containerd/namespaces"
|
"github.com/containerd/containerd/namespaces"
|
||||||
"github.com/containerd/typeurl"
|
"github.com/containerd/typeurl"
|
||||||
"github.com/docker/go-metrics"
|
"github.com/docker/go-metrics"
|
||||||
"github.com/gogo/protobuf/types"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Statable type that returns cgroup metrics
|
|
||||||
type Statable interface {
|
|
||||||
ID() string
|
|
||||||
Namespace() string
|
|
||||||
Stats(context.Context) (*types.Any, error)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Trigger will be called when an event happens and provides the cgroup
|
// Trigger will be called when an event happens and provides the cgroup
|
||||||
// where the event originated from
|
// where the event originated from
|
||||||
type Trigger func(string, string, cgroups.Cgroup)
|
type Trigger func(string, string, cgroups.Cgroup)
|
||||||
@ -71,7 +64,7 @@ func taskID(id, namespace string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type entry struct {
|
type entry struct {
|
||||||
task Statable
|
task common.Statable
|
||||||
// ns is an optional child namespace that contains additional to parent labels.
|
// ns is an optional child namespace that contains additional to parent labels.
|
||||||
// This can be used to append task specific labels to be able to differentiate the different containerd metrics.
|
// This can be used to append task specific labels to be able to differentiate the different containerd metrics.
|
||||||
ns *metrics.Namespace
|
ns *metrics.Namespace
|
||||||
@ -80,12 +73,34 @@ type entry struct {
|
|||||||
// Collector provides the ability to collect container stats and export
|
// Collector provides the ability to collect container stats and export
|
||||||
// them in the prometheus format
|
// them in the prometheus format
|
||||||
type Collector struct {
|
type Collector struct {
|
||||||
mu sync.RWMutex
|
|
||||||
|
|
||||||
tasks map[string]entry
|
|
||||||
ns *metrics.Namespace
|
ns *metrics.Namespace
|
||||||
metrics []*metric
|
|
||||||
storedMetrics chan prometheus.Metric
|
storedMetrics chan prometheus.Metric
|
||||||
|
|
||||||
|
// TODO(fuweid):
|
||||||
|
//
|
||||||
|
// The Collector.Collect will be the field ns'Collect's callback,
|
||||||
|
// which be invoked periodically with internal lock. And Collector.Add
|
||||||
|
// might also invoke ns.Lock if the labels is not nil, which is easy to
|
||||||
|
// cause dead-lock.
|
||||||
|
//
|
||||||
|
// Goroutine X:
|
||||||
|
//
|
||||||
|
// ns.Collect
|
||||||
|
// ns.Lock
|
||||||
|
// Collector.Collect
|
||||||
|
// Collector.RLock
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Goroutine Y:
|
||||||
|
//
|
||||||
|
// Collector.Add
|
||||||
|
// ...(RLock/Lock)
|
||||||
|
// ns.Lock
|
||||||
|
//
|
||||||
|
// I think we should seek the way to decouple ns from Collector.
|
||||||
|
mu sync.RWMutex
|
||||||
|
tasks map[string]entry
|
||||||
|
metrics []*metric
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describe prometheus metrics
|
// Describe prometheus metrics
|
||||||
@ -148,26 +163,31 @@ func (c *Collector) collect(entry entry, ch chan<- prometheus.Metric, block bool
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Add adds the provided cgroup and id so that metrics are collected and exported
|
// Add adds the provided cgroup and id so that metrics are collected and exported
|
||||||
func (c *Collector) Add(t Statable, labels map[string]string) error {
|
func (c *Collector) Add(t common.Statable, labels map[string]string) error {
|
||||||
if c.ns == nil {
|
if c.ns == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
c.mu.Lock()
|
c.mu.RLock()
|
||||||
defer c.mu.Unlock()
|
|
||||||
id := taskID(t.ID(), t.Namespace())
|
id := taskID(t.ID(), t.Namespace())
|
||||||
if _, ok := c.tasks[id]; ok {
|
_, ok := c.tasks[id]
|
||||||
|
c.mu.RUnlock()
|
||||||
|
if ok {
|
||||||
return nil // requests to collect metrics should be idempotent
|
return nil // requests to collect metrics should be idempotent
|
||||||
}
|
}
|
||||||
|
|
||||||
entry := entry{task: t}
|
entry := entry{task: t}
|
||||||
if labels != nil {
|
if labels != nil {
|
||||||
entry.ns = c.ns.WithConstLabels(labels)
|
entry.ns = c.ns.WithConstLabels(labels)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
c.mu.Lock()
|
||||||
c.tasks[id] = entry
|
c.tasks[id] = entry
|
||||||
|
c.mu.Unlock()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove removes the provided cgroup by id from the collector
|
// Remove removes the provided cgroup by id from the collector
|
||||||
func (c *Collector) Remove(t Statable) {
|
func (c *Collector) Remove(t common.Statable) {
|
||||||
if c.ns == nil {
|
if c.ns == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -25,21 +25,14 @@ import (
|
|||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"github.com/containerd/containerd/log"
|
"github.com/containerd/containerd/log"
|
||||||
|
"github.com/containerd/containerd/metrics/cgroups/common"
|
||||||
v2 "github.com/containerd/containerd/metrics/types/v2"
|
v2 "github.com/containerd/containerd/metrics/types/v2"
|
||||||
"github.com/containerd/containerd/namespaces"
|
"github.com/containerd/containerd/namespaces"
|
||||||
"github.com/containerd/typeurl"
|
"github.com/containerd/typeurl"
|
||||||
"github.com/docker/go-metrics"
|
"github.com/docker/go-metrics"
|
||||||
"github.com/gogo/protobuf/types"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Statable type that returns cgroup metrics
|
|
||||||
type Statable interface {
|
|
||||||
ID() string
|
|
||||||
Namespace() string
|
|
||||||
Stats(context.Context) (*types.Any, error)
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewCollector registers the collector with the provided namespace and returns it so
|
// NewCollector registers the collector with the provided namespace and returns it so
|
||||||
// that cgroups can be added for collection
|
// that cgroups can be added for collection
|
||||||
func NewCollector(ns *metrics.Namespace) *Collector {
|
func NewCollector(ns *metrics.Namespace) *Collector {
|
||||||
@ -64,7 +57,7 @@ func taskID(id, namespace string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type entry struct {
|
type entry struct {
|
||||||
task Statable
|
task common.Statable
|
||||||
// ns is an optional child namespace that contains additional to parent labels.
|
// ns is an optional child namespace that contains additional to parent labels.
|
||||||
// This can be used to append task specific labels to be able to differentiate the different containerd metrics.
|
// This can be used to append task specific labels to be able to differentiate the different containerd metrics.
|
||||||
ns *metrics.Namespace
|
ns *metrics.Namespace
|
||||||
@ -73,12 +66,34 @@ type entry struct {
|
|||||||
// Collector provides the ability to collect container stats and export
|
// Collector provides the ability to collect container stats and export
|
||||||
// them in the prometheus format
|
// them in the prometheus format
|
||||||
type Collector struct {
|
type Collector struct {
|
||||||
mu sync.RWMutex
|
|
||||||
|
|
||||||
tasks map[string]entry
|
|
||||||
ns *metrics.Namespace
|
ns *metrics.Namespace
|
||||||
metrics []*metric
|
|
||||||
storedMetrics chan prometheus.Metric
|
storedMetrics chan prometheus.Metric
|
||||||
|
|
||||||
|
// TODO(fuweid):
|
||||||
|
//
|
||||||
|
// The Collector.Collect will be the field ns'Collect's callback,
|
||||||
|
// which be invoked periodically with internal lock. And Collector.Add
|
||||||
|
// might also invoke ns.Lock if the labels is not nil, which is easy to
|
||||||
|
// cause dead-lock.
|
||||||
|
//
|
||||||
|
// Goroutine X:
|
||||||
|
//
|
||||||
|
// ns.Collect
|
||||||
|
// ns.Lock
|
||||||
|
// Collector.Collect
|
||||||
|
// Collector.RLock
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Goroutine Y:
|
||||||
|
//
|
||||||
|
// Collector.Add
|
||||||
|
// ...(RLock/Lock)
|
||||||
|
// ns.Lock
|
||||||
|
//
|
||||||
|
// I think we should seek the way to decouple ns from Collector.
|
||||||
|
mu sync.RWMutex
|
||||||
|
tasks map[string]entry
|
||||||
|
metrics []*metric
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describe prometheus metrics
|
// Describe prometheus metrics
|
||||||
@ -141,26 +156,29 @@ func (c *Collector) collect(entry entry, ch chan<- prometheus.Metric, block bool
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Add adds the provided cgroup and id so that metrics are collected and exported
|
// Add adds the provided cgroup and id so that metrics are collected and exported
|
||||||
func (c *Collector) Add(t Statable, labels map[string]string) error {
|
func (c *Collector) Add(t common.Statable, labels map[string]string) error {
|
||||||
if c.ns == nil {
|
if c.ns == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
c.mu.Lock()
|
c.mu.RLock()
|
||||||
defer c.mu.Unlock()
|
|
||||||
id := taskID(t.ID(), t.Namespace())
|
id := taskID(t.ID(), t.Namespace())
|
||||||
if _, ok := c.tasks[id]; ok {
|
_, ok := c.tasks[id]
|
||||||
|
c.mu.RUnlock()
|
||||||
|
if ok {
|
||||||
return nil // requests to collect metrics should be idempotent
|
return nil // requests to collect metrics should be idempotent
|
||||||
}
|
}
|
||||||
entry := entry{task: t}
|
entry := entry{task: t}
|
||||||
if labels != nil {
|
if labels != nil {
|
||||||
entry.ns = c.ns.WithConstLabels(labels)
|
entry.ns = c.ns.WithConstLabels(labels)
|
||||||
}
|
}
|
||||||
|
c.mu.Lock()
|
||||||
c.tasks[id] = entry
|
c.tasks[id] = entry
|
||||||
|
c.mu.Unlock()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove removes the provided cgroup by id from the collector
|
// Remove removes the provided cgroup by id from the collector
|
||||||
func (c *Collector) Remove(t Statable) {
|
func (c *Collector) Remove(t common.Statable) {
|
||||||
if c.ns == nil {
|
if c.ns == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user