Merge pull request #115456 from pohly/goroutine-leak-check

test/integration: goroutine leak check
This commit is contained in:
Kubernetes Prow Robot
2023-02-14 08:31:31 -08:00
committed by GitHub
28 changed files with 901 additions and 33 deletions

View File

@@ -29,10 +29,10 @@ import (
"syscall"
"time"
"go.uber.org/goleak"
"google.golang.org/grpc/grpclog"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/kubernetes/pkg/util/env"
)
@@ -182,37 +182,48 @@ func EtcdMain(tests func() int) {
// Bail out early when -help was given as parameter.
flag.Parse()
before := runtime.NumGoroutine()
// Must be called *before* creating new goroutines.
goleakOpts := IgnoreBackgroundGoroutines()
goleakOpts = append(goleakOpts,
// lumberjack leaks a goroutine:
// https://github.com/natefinch/lumberjack/issues/56 This affects tests
// using --audit-log-path (like
// ./test/integration/apiserver/admissionwebhook/reinvocation_test.go).
// In normal production that should be harmless. We don't know here
// whether the test is using that, so we have to suppress reporting
// this leak for all tests.
//
// Both names occurred in practice.
goleak.IgnoreTopFunction("k8s.io/kubernetes/vendor/gopkg.in/natefinch/lumberjack%2ev2.(*Logger).millRun"),
goleak.IgnoreTopFunction("gopkg.in/natefinch/lumberjack%2ev2.(*Logger).millRun"),
)
stop, err := startEtcd()
if err != nil {
klog.Fatalf("cannot run integration tests: unable to start etcd: %v", err)
}
result := tests()
stop() // Don't defer this. See os.Exit documentation.
klog.StopFlushDaemon()
checkNumberOfGoroutines := func() (bool, error) {
// We leave some room for leaked goroutines as there are
// still some leaks, mostly:
// - leak from lumberjack package we're vendoring
// - leak from apiserve healthz
// - leak from opencensus library
// Once fixed, we should be able to bring it down to zero.
if dg := runtime.NumGoroutine() - before; dg <= 3 {
return true, nil
// Several tests don't wait for goroutines to stop. goleak.Find retries
// internally, but not long enough. 5 seconds seemed to be enough for
// most tests, even when testing in the CI.
timeout := 5 * time.Second
start := time.Now()
for {
err := goleak.Find(goleakOpts...)
if err == nil {
break
}
if time.Now().Sub(start) >= timeout {
klog.ErrorS(err, "EtcdMain goroutine check")
result = 1
break
}
// Allow goroutines to schedule and die off.
runtime.Gosched()
return false, nil
}
// It generally takes visibly less than 1s to finish all goroutines.
// But we keep the limit higher to account for cpu-starved environments.
if err := wait.Poll(100*time.Millisecond, 5*time.Second, checkNumberOfGoroutines); err != nil {
after := runtime.NumGoroutine()
stacktraces := make([]byte, 1<<20)
runtime.Stack(stacktraces, true)
klog.Fatalf("unexpected number of goroutines: before: %d after %d\n%sd", before, after, string(stacktraces))
}
os.Exit(result)
}

View File

@@ -0,0 +1,36 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"go.uber.org/goleak"
"k8s.io/apiserver/pkg/server/healthz"
)
// IgnoreBackgroundGoroutines builds the option list for goleak.Find that
// hides every goroutine already alive at the time of the call, such as
// those started by "go test" and by package init functions (for example
// the one from go.opencensus.io/stats/view/worker.go).
//
// Before capturing the current goroutines it invokes the apiserver log
// health check once, so that goroutines which would otherwise only be
// created on demand later are already running and get ignored as well.
//
// Because everything running at call time is excluded from later leak
// reports, call this before the code under test starts its own goroutines.
func IgnoreBackgroundGoroutines() []goleak.Option {
	// The result of the check is deliberately discarded: the call is made
	// only to make sure the on-demand goroutines exist before the snapshot.
	_ = healthz.LogHealthz.Check(nil)

	var opts []goleak.Option
	opts = append(opts, goleak.IgnoreCurrent())
	return opts
}