test: use go-uber/goleak for strict leak checking
It provides more readable output and has additional APIs for using it inside a unit test. goleak.IgnoreCurrent is needed to filter out the goroutine that gets started when importing go.opencensus.io/stats/view. In order to handle background goroutines that get created on demand and cannot be stopped (like the one for LogzHealth), a helper function ensures that those are running before calling goleak.IgnoreCurrent. Keeping those goroutines running is not a problem and thus not worth the effort of adding new APIs to stop them. Other goroutines are genuine leaks for which no fix is available. Those get suppressed via IgnoreTopFunction, which works as long as that function is unique enough. Example output for the leak fixed in https://github.com/kubernetes/kubernetes/pull/115423: E0202 09:30:51.641841 74789 etcd.go:205] "EtcdMain goroutine check" err=< found unexpected goroutines: [Goroutine 4889 in state chan receive, with k8s.io/apimachinery/pkg/watch.(*Broadcaster).loop on top of the stack: goroutine 4889 [chan receive]: k8s.io/apimachinery/pkg/watch.(*Broadcaster).loop(0xc0076183c0) /nvme/gopath/src/k8s.io/kubernetes/vendor/k8s.io/apimachinery/pkg/watch/mux.go:268 +0x65 created by k8s.io/apimachinery/pkg/watch.NewBroadcaster /nvme/gopath/src/k8s.io/kubernetes/vendor/k8s.io/apimachinery/pkg/watch/mux.go:77 +0x116 >
This commit is contained in:
@@ -29,10 +29,10 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"go.uber.org/goleak"
|
||||
"google.golang.org/grpc/grpclog"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/kubernetes/pkg/util/env"
|
||||
)
|
||||
|
||||
@@ -182,37 +182,48 @@ func EtcdMain(tests func() int) {
|
||||
// Bail out early when -help was given as parameter.
|
||||
flag.Parse()
|
||||
|
||||
before := runtime.NumGoroutine()
|
||||
// Must be called *before* creating new goroutines.
|
||||
goleakOpts := IgnoreBackgroundGoroutines()
|
||||
|
||||
goleakOpts = append(goleakOpts,
|
||||
// lumberjack leaks a goroutine:
|
||||
// https://github.com/natefinch/lumberjack/issues/56 This affects tests
|
||||
// using --audit-log-path (like
|
||||
// ./test/integration/apiserver/admissionwebhook/reinvocation_test.go).
|
||||
// In normal production that should be harmless. We don't know here
|
||||
// whether the test is using that, so we have to suppress reporting
|
||||
// this leak for all tests.
|
||||
//
|
||||
// Both names occurred in practice.
|
||||
goleak.IgnoreTopFunction("k8s.io/kubernetes/vendor/gopkg.in/natefinch/lumberjack%2ev2.(*Logger).millRun"),
|
||||
goleak.IgnoreTopFunction("gopkg.in/natefinch/lumberjack%2ev2.(*Logger).millRun"),
|
||||
)
|
||||
|
||||
stop, err := startEtcd()
|
||||
if err != nil {
|
||||
klog.Fatalf("cannot run integration tests: unable to start etcd: %v", err)
|
||||
}
|
||||
result := tests()
|
||||
stop() // Don't defer this. See os.Exit documentation.
|
||||
klog.StopFlushDaemon()
|
||||
|
||||
checkNumberOfGoroutines := func() (bool, error) {
|
||||
// We leave some room for leaked goroutines as there are
|
||||
// still some leaks, mostly:
|
||||
// - leak from lumberjack package we're vendoring
|
||||
// - leak from apiserve healthz
|
||||
// - leak from opencensus library
|
||||
// Once fixed, we should be able to bring it down to zero.
|
||||
if dg := runtime.NumGoroutine() - before; dg <= 3 {
|
||||
return true, nil
|
||||
// Several tests don't wait for goroutines to stop. goleak.Find retries
|
||||
// internally, but not long enough. 5 seconds seemed to be enough for
|
||||
// most tests, even when testing in the CI.
|
||||
timeout := 5 * time.Second
|
||||
start := time.Now()
|
||||
for {
|
||||
err := goleak.Find(goleakOpts...)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
if time.Now().Sub(start) >= timeout {
|
||||
klog.ErrorS(err, "EtcdMain goroutine check")
|
||||
result = 1
|
||||
break
|
||||
}
|
||||
// Allow goroutines to schedule and die off.
|
||||
runtime.Gosched()
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// It generally takes visibly less than 1s to finish all goroutines.
|
||||
// But we keep the limit higher to account for cpu-starved environments.
|
||||
if err := wait.Poll(100*time.Millisecond, 5*time.Second, checkNumberOfGoroutines); err != nil {
|
||||
after := runtime.NumGoroutine()
|
||||
stacktraces := make([]byte, 1<<20)
|
||||
runtime.Stack(stacktraces, true)
|
||||
klog.Fatalf("unexpected number of goroutines: before: %d after %d\n%sd", before, after, string(stacktraces))
|
||||
}
|
||||
os.Exit(result)
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user