kubeadm: fix flakes when performing etcd MemberAdd on slower setups
In slower setups it can take more time for the existing cluster to reach a healthy state, so the existing backoff of ~50 seconds is apparently not sufficient. The client dial can also fail for similar reasons. Improve the tolerance of kubeadm join when adding new etcd members: wrap both the client dial and the member add in a longer backoff (up to ~200 seconds). This particular change should be backported to the supported version skew. In a future change for master, all etcd client operations should be made consistent so that the etcd logic is in a sane state.
This commit is contained in:
parent
8dd93ca94c
commit
1c430ff30f
@@ -351,23 +351,32 @@ func (c *Client) AddMember(name string, peerAddrs string) ([]Member, error) {
|
|||||||
return nil, errors.Wrapf(err, "error parsing peer address %s", peerAddrs)
|
return nil, errors.Wrapf(err, "error parsing peer address %s", peerAddrs)
|
||||||
}
|
}
|
||||||
|
|
||||||
cli, err := clientv3.New(clientv3.Config{
|
// Exponential backoff for the MemberAdd operation (up to ~200 seconds)
|
||||||
Endpoints: c.Endpoints,
|
etcdBackoffAdd := wait.Backoff{
|
||||||
DialTimeout: dialTimeout,
|
Steps: 18,
|
||||||
DialOptions: []grpc.DialOption{
|
Duration: 100 * time.Millisecond,
|
||||||
grpc.WithBlock(), // block until the underlying connection is up
|
Factor: 1.5,
|
||||||
},
|
Jitter: 0.1,
|
||||||
TLS: c.TLS,
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
}
|
||||||
defer cli.Close()
|
|
||||||
|
|
||||||
// Adds a new member to the cluster
|
// Adds a new member to the cluster
|
||||||
var lastError error
|
var lastError error
|
||||||
var resp *clientv3.MemberAddResponse
|
var resp *clientv3.MemberAddResponse
|
||||||
err = wait.ExponentialBackoff(etcdBackoff, func() (bool, error) {
|
err = wait.ExponentialBackoff(etcdBackoffAdd, func() (bool, error) {
|
||||||
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
Endpoints: c.Endpoints,
|
||||||
|
DialTimeout: etcdTimeout,
|
||||||
|
DialOptions: []grpc.DialOption{
|
||||||
|
grpc.WithBlock(), // block until the underlying connection is up
|
||||||
|
},
|
||||||
|
TLS: c.TLS,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
lastError = err
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
defer cli.Close()
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout)
|
ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout)
|
||||||
resp, err = cli.MemberAdd(ctx, []string{peerAddrs})
|
resp, err = cli.MemberAdd(ctx, []string{peerAddrs})
|
||||||
cancel()
|
cancel()
|
||||||
|
Loading…
Reference in New Issue
Block a user