Provide backpressure to clients when etcd goes down

Today, when etcd is down, we don't handle the resulting error
specifically, so clients get a generic 500 error. This commit adds
formal internal error types for WatchExpired and EtcdUnreachable, and
converts them to api/errors before returning to the client. It also
upgrades the client to retry on any 429 or 5xx response that carries a
Retry-After header, instead of only on 429.
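
As a rough illustration (not the actual client change), the new retry
rule can be thought of as: back off whenever the status is 429 or any
5xx and the response carries a parseable Retry-After header. The
retryAfter helper below is hypothetical.

package main

import (
	"fmt"
	"net/http"
	"strconv"
	"time"
)

// retryAfter reports whether the response asks the client to back off,
// and for how long. Sketch only: honor Retry-After on 429 and on any
// 5xx, not just on 429 as before.
func retryAfter(resp *http.Response) (time.Duration, bool) {
	if resp.StatusCode != http.StatusTooManyRequests && resp.StatusCode < 500 {
		return 0, false
	}
	seconds, err := strconv.Atoi(resp.Header.Get("Retry-After"))
	if err != nil {
		return 0, false // no parseable Retry-After header: don't retry
	}
	return time.Duration(seconds) * time.Second, true
}

func main() {
	resp := &http.Response{
		StatusCode: http.StatusInternalServerError,
		Header:     http.Header{"Retry-After": []string{"2"}},
	}
	if wait, ok := retryAfter(resp); ok {
		fmt.Printf("server asked us to back off for %s\n", wait)
	}
}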

In combination, this allows the apiserver to exert backpressure on
controllers that are hotlooping. The default delay is 2 seconds, but we
could ramp that up further in a future iteration.
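
A minimal server-side sketch, assuming a hypothetical
errEtcdUnreachable sentinel and writeError helper rather than the real
apiserver plumbing: when the storage error is recognized as etcd being
unreachable, reply 429 with Retry-After: 2 so callers back off instead
of retrying immediately.

package main

import (
	"errors"
	"log"
	"net/http"
)

// errEtcdUnreachable stands in for the internal EtcdUnreachable error
// type introduced by this commit; the name here is illustrative.
var errEtcdUnreachable = errors.New("etcd unreachable")

// writeError converts a storage error into an HTTP response: etcd
// being unreachable becomes a 429 with a Retry-After header (2 seconds
// by default), everything else stays a generic 500.
func writeError(w http.ResponseWriter, err error) {
	if errors.Is(err, errEtcdUnreachable) {
		w.Header().Set("Retry-After", "2")
		http.Error(w, "storage unavailable, retry later", http.StatusTooManyRequests)
		return
	}
	http.Error(w, err.Error(), http.StatusInternalServerError)
}

func main() {
	http.HandleFunc("/api", func(w http.ResponseWriter, r *http.Request) {
		writeError(w, errEtcdUnreachable) // simulate etcd being down
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}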
Clayton Coleman
2015-11-04 15:15:01 -05:00
parent ca6fe97275
commit 3da15535b6
8 changed files with 103 additions and 13 deletions


@@ -182,10 +182,7 @@ func (e *Etcd) ListPredicate(ctx api.Context, m generic.Matcher, options *api.Li
 			trace.Step("About to read single object")
 			err := e.Storage.GetToList(ctx, key, filterFunc, list)
 			trace.Step("Object extracted")
-			if err != nil {
-				return nil, err
-			}
-			return list, nil
+			return list, etcderr.InterpretListError(err, e.EndpointName)
 		}
 		// if we cannot extract a key based on the current context, the optimization is skipped
 	}
@@ -200,10 +197,7 @@ func (e *Etcd) ListPredicate(ctx api.Context, m generic.Matcher, options *api.Li
 	}
 	err = e.Storage.List(ctx, e.KeyRootFunc(ctx), version, filterFunc, list)
 	trace.Step("List extracted")
-	if err != nil {
-		return nil, err
-	}
-	return list, nil
+	return list, etcderr.InterpretListError(err, e.EndpointName)
 }
 
 // Create inserts a new item according to the unique key from the object.
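
The diff routes every list error through etcderr.InterpretListError. A
much-simplified, hypothetical version of such a helper (the real one
produces proper api/errors statuses) might look like:

package main

import (
	"errors"
	"fmt"
)

// errUnreachable stands in for the raw error the storage layer returns
// when etcd cannot be reached.
var errUnreachable = errors.New("etcd: server unreachable")

// statusError is a toy stand-in for an api/errors status.
type statusError struct {
	code    int
	message string
}

func (e *statusError) Error() string { return e.message }

// interpretListError translates a raw storage error into a status
// error so the handler can return something better than a generic 500.
func interpretListError(err error, kind string) error {
	switch {
	case err == nil:
		return nil
	case errors.Is(err, errUnreachable):
		return &statusError{code: 429, message: fmt.Sprintf("%s list failed: storage unavailable", kind)}
	default:
		return &statusError{code: 500, message: err.Error()}
	}
}

func main() {
	if serr, ok := interpretListError(errUnreachable, "pods").(*statusError); ok {
		fmt.Printf("HTTP %d: %s\n", serr.code, serr.message)
	}
}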