Add additional etcd check to readyz with 2 seconds timeout.

This commit is contained in:
Maciej Wyrzuc
2022-07-25 13:08:50 +00:00
parent 96e22f28db
commit b42045a64f
8 changed files with 336 additions and 17 deletions

View File

@@ -164,6 +164,7 @@ func TestAddFlags(t *testing.T) {
CountMetricPollPeriod: time.Minute, CountMetricPollPeriod: time.Minute,
DBMetricPollInterval: storagebackend.DefaultDBMetricPollInterval, DBMetricPollInterval: storagebackend.DefaultDBMetricPollInterval,
HealthcheckTimeout: storagebackend.DefaultHealthcheckTimeout, HealthcheckTimeout: storagebackend.DefaultHealthcheckTimeout,
ReadycheckTimeout: storagebackend.DefaultReadinessTimeout,
LeaseManagerConfig: etcd3.LeaseManagerConfig{ LeaseManagerConfig: etcd3.LeaseManagerConfig{
ReuseDurationSeconds: 100, ReuseDurationSeconds: 100,
MaxObjectCount: 1000, MaxObjectCount: 1000,

View File

@@ -455,6 +455,12 @@ func (c *Config) AddHealthChecks(healthChecks ...healthz.HealthChecker) {
c.ReadyzChecks = append(c.ReadyzChecks, healthChecks...) c.ReadyzChecks = append(c.ReadyzChecks, healthChecks...)
} }
// AddReadyzChecks adds a health check to our config to be exposed by the readyz endpoint
// of our configured apiserver.
func (c *Config) AddReadyzChecks(healthChecks ...healthz.HealthChecker) {
c.ReadyzChecks = append(c.ReadyzChecks, healthChecks...)
}
// AddPostStartHook allows you to add a PostStartHook that will later be added to the server itself in a New call. // AddPostStartHook allows you to add a PostStartHook that will later be added to the server itself in a New call.
// Name conflicts will cause an error. // Name conflicts will cause an error.
func (c *Config) AddPostStartHook(name string, hook PostStartHookFunc) error { func (c *Config) AddPostStartHook(name string, hook PostStartHookFunc) error {

View File

@@ -183,6 +183,9 @@ func (s *EtcdOptions) AddFlags(fs *pflag.FlagSet) {
fs.DurationVar(&s.StorageConfig.HealthcheckTimeout, "etcd-healthcheck-timeout", s.StorageConfig.HealthcheckTimeout, fs.DurationVar(&s.StorageConfig.HealthcheckTimeout, "etcd-healthcheck-timeout", s.StorageConfig.HealthcheckTimeout,
"The timeout to use when checking etcd health.") "The timeout to use when checking etcd health.")
fs.DurationVar(&s.StorageConfig.ReadycheckTimeout, "etcd-readycheck-timeout", s.StorageConfig.ReadycheckTimeout,
"The timeout to use when checking etcd readiness")
fs.Int64Var(&s.StorageConfig.LeaseManagerConfig.ReuseDurationSeconds, "lease-reuse-duration-seconds", s.StorageConfig.LeaseManagerConfig.ReuseDurationSeconds, fs.Int64Var(&s.StorageConfig.LeaseManagerConfig.ReuseDurationSeconds, "lease-reuse-duration-seconds", s.StorageConfig.LeaseManagerConfig.ReuseDurationSeconds,
"The time in seconds that each lease is reused. A lower value could avoid large number of objects reusing the same lease. Notice that a too small value may cause performance problems at storage layer.") "The time in seconds that each lease is reused. A lower value could avoid large number of objects reusing the same lease. Notice that a too small value may cause performance problems at storage layer.")
} }
@@ -234,6 +237,14 @@ func (s *EtcdOptions) addEtcdHealthEndpoint(c *server.Config) error {
return healthCheck() return healthCheck()
})) }))
readyCheck, err := storagefactory.CreateReadyCheck(s.StorageConfig, c.DrainedNotify())
if err != nil {
return err
}
c.AddReadyzChecks(healthz.NamedCheck("etcd-readiness", func(r *http.Request) error {
return readyCheck()
}))
if s.EncryptionProviderConfigFilepath != "" { if s.EncryptionProviderConfigFilepath != "" {
kmsPluginHealthzChecks, err := encryptionconfig.GetKMSPluginHealthzCheckers(s.EncryptionProviderConfigFilepath) kmsPluginHealthzChecks, err := encryptionconfig.GetKMSPluginHealthzCheckers(s.EncryptionProviderConfigFilepath)
if err != nil { if err != nil {

View File

@@ -26,6 +26,7 @@ import (
"k8s.io/apimachinery/pkg/runtime/serializer" "k8s.io/apimachinery/pkg/runtime/serializer"
utilerrors "k8s.io/apimachinery/pkg/util/errors" utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apiserver/pkg/server" "k8s.io/apiserver/pkg/server"
"k8s.io/apiserver/pkg/server/healthz"
"k8s.io/apiserver/pkg/storage/storagebackend" "k8s.io/apiserver/pkg/storage/storagebackend"
) )
@@ -230,18 +231,57 @@ func TestKMSHealthzEndpoint(t *testing.T) {
} }
for _, n := range tc.wantChecks { for _, n := range tc.wantChecks {
found := false if !hasCheck(n, serverConfig.HealthzChecks) {
for _, h := range serverConfig.HealthzChecks {
if n == h.Name() {
found = true
break
}
}
if !found {
t.Errorf("Missing HealthzChecker %s", n) t.Errorf("Missing HealthzChecker %s", n)
} }
found = false
} }
}) })
} }
} }
func TestReadinessCheck(t *testing.T) {
testCases := []struct {
name string
wantReadyzChecks []string
wantHealthzChecks []string
}{
{
name: "Readyz should have etcd-readiness check",
wantReadyzChecks: []string{"etcd", "etcd-readiness"},
wantHealthzChecks: []string{"etcd"},
},
}
scheme := runtime.NewScheme()
codecs := serializer.NewCodecFactory(scheme)
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
serverConfig := server.NewConfig(codecs)
etcdOptions := &EtcdOptions{}
if err := etcdOptions.addEtcdHealthEndpoint(serverConfig); err != nil {
t.Fatalf("Failed to add healthz error: %v", err)
}
for _, n := range tc.wantReadyzChecks {
if !hasCheck(n, serverConfig.ReadyzChecks) {
t.Errorf("Missing ReadyzChecker %s", n)
}
}
for _, n := range tc.wantHealthzChecks {
if !hasCheck(n, serverConfig.HealthzChecks) {
t.Errorf("Missing HealthzChecker %s", n)
}
}
})
}
}
func hasCheck(want string, healthchecks []healthz.HealthChecker) bool {
for _, h := range healthchecks {
if want == h.Name() {
return true
}
}
return false
}

View File

@@ -36,6 +36,7 @@ const (
DefaultCompactInterval = 5 * time.Minute DefaultCompactInterval = 5 * time.Minute
DefaultDBMetricPollInterval = 30 * time.Second DefaultDBMetricPollInterval = 30 * time.Second
DefaultHealthcheckTimeout = 2 * time.Second DefaultHealthcheckTimeout = 2 * time.Second
DefaultReadinessTimeout = 2 * time.Second
) )
// TransportConfig holds all connection related info, i.e. equal TransportConfig means equal servers we talk to. // TransportConfig holds all connection related info, i.e. equal TransportConfig means equal servers we talk to.
@@ -84,6 +85,8 @@ type Config struct {
DBMetricPollInterval time.Duration DBMetricPollInterval time.Duration
// HealthcheckTimeout specifies the timeout used when checking health // HealthcheckTimeout specifies the timeout used when checking health
HealthcheckTimeout time.Duration HealthcheckTimeout time.Duration
// ReadycheckTimeout specifies the timeout used when checking readiness
ReadycheckTimeout time.Duration
LeaseManagerConfig etcd3.LeaseManagerConfig LeaseManagerConfig etcd3.LeaseManagerConfig
@@ -117,6 +120,7 @@ func NewDefaultConfig(prefix string, codec runtime.Codec) *Config {
CompactionInterval: DefaultCompactInterval, CompactionInterval: DefaultCompactInterval,
DBMetricPollInterval: DefaultDBMetricPollInterval, DBMetricPollInterval: DefaultDBMetricPollInterval,
HealthcheckTimeout: DefaultHealthcheckTimeout, HealthcheckTimeout: DefaultHealthcheckTimeout,
ReadycheckTimeout: DefaultReadinessTimeout,
LeaseManagerConfig: etcd3.NewDefaultLeaseManagerConfig(), LeaseManagerConfig: etcd3.NewDefaultLeaseManagerConfig(),
} }
} }

View File

@@ -73,6 +73,22 @@ func init() {
} }
func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) { func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
timeout := storagebackend.DefaultHealthcheckTimeout
if c.HealthcheckTimeout != time.Duration(0) {
timeout = c.HealthcheckTimeout
}
return newETCD3Check(c, timeout, stopCh)
}
func newETCD3ReadyCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
timeout := storagebackend.DefaultReadinessTimeout
if c.ReadycheckTimeout != time.Duration(0) {
timeout = c.ReadycheckTimeout
}
return newETCD3Check(c, timeout, stopCh)
}
func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan struct{}) (func() error, error) {
// constructing the etcd v3 client blocks and times out if etcd is not available. // constructing the etcd v3 client blocks and times out if etcd is not available.
// retry in a loop in the background until we successfully create the client, storing the client or error encountered // retry in a loop in the background until we successfully create the client, storing the client or error encountered
@@ -129,23 +145,18 @@ func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func(
if clientErr != nil { if clientErr != nil {
return clientErr return clientErr
} }
ctx, cancel := context.WithTimeout(context.Background(), timeout)
healthcheckTimeout := storagebackend.DefaultHealthcheckTimeout
if c.HealthcheckTimeout != time.Duration(0) {
healthcheckTimeout = c.HealthcheckTimeout
}
ctx, cancel := context.WithTimeout(context.Background(), healthcheckTimeout)
defer cancel() defer cancel()
// See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118 // See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118
_, err := client.Get(ctx, path.Join("/", c.Prefix, "health")) _, err := client.Get(ctx, path.Join("/", c.Prefix, "health"))
if err == nil { if err == nil {
return nil return nil
} }
return fmt.Errorf("error getting data from etcd: %v", err) return fmt.Errorf("error getting data from etcd: %w", err)
}, nil }, nil
} }
func newETCD3Client(c storagebackend.TransportConfig) (*clientv3.Client, error) { var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, error) {
tlsInfo := transport.TLSInfo{ tlsInfo := transport.TLSInfo{
CertFile: c.CertFile, CertFile: c.CertFile,
KeyFile: c.KeyFile, KeyFile: c.KeyFile,

View File

@@ -50,3 +50,14 @@ func CreateHealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func()
return nil, fmt.Errorf("unknown storage type: %s", c.Type) return nil, fmt.Errorf("unknown storage type: %s", c.Type)
} }
} }
func CreateReadyCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3ReadyCheck(c, stopCh)
default:
return nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}

View File

@@ -0,0 +1,235 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package factory
import (
"context"
"errors"
"testing"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"k8s.io/apiserver/pkg/storage/etcd3/testserver"
"k8s.io/apiserver/pkg/storage/storagebackend"
)
type mockKV struct {
get func(ctx context.Context) (*clientv3.GetResponse, error)
}
func (mkv mockKV) Put(ctx context.Context, key, val string, opts ...clientv3.OpOption) (*clientv3.PutResponse, error) {
return nil, nil
}
func (mkv mockKV) Get(ctx context.Context, key string, opts ...clientv3.OpOption) (*clientv3.GetResponse, error) {
return mkv.get(ctx)
}
func (mockKV) Delete(ctx context.Context, key string, opts ...clientv3.OpOption) (*clientv3.DeleteResponse, error) {
return nil, nil
}
func (mockKV) Compact(ctx context.Context, rev int64, opts ...clientv3.CompactOption) (*clientv3.CompactResponse, error) {
return nil, nil
}
func (mockKV) Do(ctx context.Context, op clientv3.Op) (clientv3.OpResponse, error) {
return clientv3.OpResponse{}, nil
}
func (mockKV) Txn(ctx context.Context) clientv3.Txn {
return nil
}
func TestCreateHealthcheck(t *testing.T) {
etcdConfig := testserver.NewTestConfig(t)
client := testserver.RunEtcd(t, etcdConfig)
newETCD3ClientFn := newETCD3Client
defer func() {
newETCD3Client = newETCD3ClientFn
}()
tests := []struct {
name string
cfg storagebackend.Config
want error
responseTime time.Duration
}{
{
name: "ok if response time lower than default timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
},
responseTime: 1 * time.Second,
want: nil,
},
{
name: "ok if response time lower than custom timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
HealthcheckTimeout: 5 * time.Second,
},
responseTime: 3 * time.Second,
want: nil,
},
{
name: "timeouts if response time higher than default timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
},
responseTime: 3 * time.Second,
want: context.DeadlineExceeded,
},
{
name: "timeouts if response time higher than custom timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
HealthcheckTimeout: 3 * time.Second,
},
responseTime: 5 * time.Second,
want: context.DeadlineExceeded,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
tc.cfg.Transport.ServerList = client.Endpoints()
newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, error) {
dummyKV := mockKV{
get: func(ctx context.Context) (*clientv3.GetResponse, error) {
select {
case <-ctx.Done():
return nil, ctx.Err()
case <-time.After(tc.responseTime):
return nil, nil
}
},
}
client.KV = dummyKV
return client, nil
}
stop := make(chan struct{})
healthcheck, err := CreateHealthCheck(tc.cfg, stop)
if err != nil {
t.Fatal(err)
}
// Wait for healthcheck to establish connection
time.Sleep(2 * time.Second)
got := healthcheck()
if !errors.Is(got, tc.want) {
t.Errorf("healthcheck() missmatch want %v got %v", tc.want, got)
}
})
}
}
func TestCreateReadycheck(t *testing.T) {
etcdConfig := testserver.NewTestConfig(t)
client := testserver.RunEtcd(t, etcdConfig)
newETCD3ClientFn := newETCD3Client
defer func() {
newETCD3Client = newETCD3ClientFn
}()
tests := []struct {
name string
cfg storagebackend.Config
want error
responseTime time.Duration
}{
{
name: "ok if response time lower than default timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
},
responseTime: 1 * time.Second,
want: nil,
},
{
name: "ok if response time lower than custom timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
ReadycheckTimeout: 5 * time.Second,
},
responseTime: 3 * time.Second,
want: nil,
},
{
name: "timeouts if response time higher than default timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
},
responseTime: 3 * time.Second,
want: context.DeadlineExceeded,
},
{
name: "timeouts if response time higher than custom timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
ReadycheckTimeout: 3 * time.Second,
},
responseTime: 5 * time.Second,
want: context.DeadlineExceeded,
},
{
name: "timeouts if response time higher than default timeout with custom healthcheck timeout",
cfg: storagebackend.Config{
Type: storagebackend.StorageTypeETCD3,
Transport: storagebackend.TransportConfig{},
HealthcheckTimeout: 10 * time.Second,
},
responseTime: 3 * time.Second,
want: context.DeadlineExceeded,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
tc.cfg.Transport.ServerList = client.Endpoints()
newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, error) {
dummyKV := mockKV{
get: func(ctx context.Context) (*clientv3.GetResponse, error) {
select {
case <-ctx.Done():
return nil, ctx.Err()
case <-time.After(tc.responseTime):
return nil, nil
}
},
}
client.KV = dummyKV
return client, nil
}
stop := make(chan struct{})
healthcheck, err := CreateReadyCheck(tc.cfg, stop)
if err != nil {
t.Fatal(err)
}
// Wait for healthcheck to establish connection
time.Sleep(2 * time.Second)
got := healthcheck()
if !errors.Is(got, tc.want) {
t.Errorf("healthcheck() missmatch want %v got %v", tc.want, got)
}
})
}
}