Provide backpressure to clients when etcd goes down

Today, when etcd is down, we don't specifically handle the resulting error, which means clients get a generic 500 error. This commit adds formal internal error types for both WatchExpired and EtcdUnreachable, and then converts them to api/errors before returning to the client. It also upgrades the client to retry on any 429 or 5xx error that has a Retry-After header, instead of just 429. In combination, this allows the apiserver to exert backpressure on controllers that are hot-looping. The retry delay returned to clients is 2 seconds by default, but we could potentially ramp that up even further in a future iteration.
2015-11-04 15:15:01 -05:00
parent ca6fe97275
commit 3da15535b6
8 changed files with 103 additions and 13 deletions
--- a/pkg/api/errors/etcd/etcd.go
+++ b/pkg/api/errors/etcd/etcd.go
@@ -21,12 +21,27 @@ import (
 	etcdstorage "k8s.io/kubernetes/pkg/storage/etcd"
 )

+// InterpretListError converts a generic etcd error on a list operation
+// into the appropriate API error; unrecognized errors are returned unchanged.
+func InterpretListError(err error, kind string) error {
+	switch {
+	case etcdstorage.IsEtcdNotFound(err):
+		return errors.NewNotFound(kind, "")
+	case etcdstorage.IsEtcdUnreachable(err):
+		return errors.NewServerTimeout(kind, "list", 2) // TODO: make configurable or handled at a higher level
+	default:
+		return err
+	}
+}
+
 // InterpretGetError converts a generic etcd error on a retrieval
 // operation into the appropriate API error.
 func InterpretGetError(err error, kind, name string) error {
 	switch {
 	case etcdstorage.IsEtcdNotFound(err):
 		return errors.NewNotFound(kind, name)
+	case etcdstorage.IsEtcdUnreachable(err):
+		return errors.NewServerTimeout(kind, "get", 2) // TODO: make configurable or handled at a higher level
 	default:
 		return err
 	}
@@ -38,6 +53,8 @@ func InterpretCreateError(err error, kind, name string) error {
 	switch {
 	case etcdstorage.IsEtcdNodeExist(err):
 		return errors.NewAlreadyExists(kind, name)
+	case etcdstorage.IsEtcdUnreachable(err):
+		return errors.NewServerTimeout(kind, "create", 2) // TODO: make configurable or handled at a higher level
 	default:
 		return err
 	}
@@ -49,6 +66,8 @@ func InterpretUpdateError(err error, kind, name string) error {
 	switch {
 	case etcdstorage.IsEtcdTestFailed(err), etcdstorage.IsEtcdNodeExist(err):
 		return errors.NewConflict(kind, name, err)
+	case etcdstorage.IsEtcdUnreachable(err):
+		return errors.NewServerTimeout(kind, "update", 2) // TODO: make configurable or handled at a higher level
 	default:
 		return err
 	}
@@ -60,6 +79,8 @@ func InterpretDeleteError(err error, kind, name string) error {
 	switch {
 	case etcdstorage.IsEtcdNotFound(err):
 		return errors.NewNotFound(kind, name)
+	case etcdstorage.IsEtcdUnreachable(err):
+		return errors.NewServerTimeout(kind, "delete", 2) // TODO: make configurable or handled at a higher level
 	default:
 		return err
 	}
--- a/pkg/api/unversioned/types.go
+++ b/pkg/api/unversioned/types.go
@@ -215,6 +215,12 @@ const (
 	// Status code 500
 	StatusReasonInternalError = "InternalError"

+	// StatusReasonExpired indicates that the request is invalid because the content you are requesting
+	// has expired and is no longer available. It is typically associated with watches that can't be
+	// serviced.
+	// Status code 410 (gone)
+	StatusReasonExpired = "Expired"
+
 	// StatusReasonServiceUnavailable means that the request itself was valid,
 	// but the requested service is unavailable at this time.
 	// Retrying the request after some time might succeed.