Merge pull request #89777 from wojtek-t/fix_networking_tests_timeouts

Fix networking tests timeouts for large clusters
This commit is contained in:
Kubernetes Prow Robot
2020-04-02 12:43:20 -07:00
committed by GitHub
7 changed files with 64 additions and 50 deletions

View File

@@ -765,14 +765,14 @@ func (j *TestJig) pollIngressWithCert(ing *networkingv1beta1.Ingress, address st
// WaitForIngress waits for the Ingress to get an address.
// WaitForIngress returns when it gets the first 200 response
func (j *TestJig) WaitForIngress(waitForNodePort bool) {
if err := j.WaitForGivenIngressWithTimeout(j.Ingress, waitForNodePort, e2eservice.LoadBalancerPollTimeout); err != nil {
if err := j.WaitForGivenIngressWithTimeout(j.Ingress, waitForNodePort, e2eservice.GetServiceLoadBalancerPropagationTimeout(j.Client)); err != nil {
framework.Failf("error in waiting for ingress to get an address: %s", err)
}
}
// WaitForIngressToStable waits for the LB return 100 consecutive 200 responses.
func (j *TestJig) WaitForIngressToStable() {
if err := wait.Poll(10*time.Second, e2eservice.LoadBalancerPropagationTimeoutDefault, func() (bool, error) {
if err := wait.Poll(10*time.Second, e2eservice.GetServiceLoadBalancerPropagationTimeout(j.Client), func() (bool, error) {
_, err := j.GetDistinctResponseFromIngress()
if err != nil {
return false, nil
@@ -811,12 +811,13 @@ func (j *TestJig) WaitForGivenIngressWithTimeout(ing *networkingv1beta1.Ingress,
// Ingress. Hostnames and certificate need to be explicitly passed in.
func (j *TestJig) WaitForIngressWithCert(waitForNodePort bool, knownHosts []string, cert []byte) error {
// Wait for the loadbalancer IP.
address, err := j.WaitForIngressAddress(j.Client, j.Ingress.Namespace, j.Ingress.Name, e2eservice.LoadBalancerPollTimeout)
propagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(j.Client)
address, err := j.WaitForIngressAddress(j.Client, j.Ingress.Namespace, j.Ingress.Name, propagationTimeout)
if err != nil {
return fmt.Errorf("Ingress failed to acquire an IP address within %v", e2eservice.LoadBalancerPollTimeout)
return fmt.Errorf("Ingress failed to acquire an IP address within %v", propagationTimeout)
}
return j.pollIngressWithCert(j.Ingress, address, knownHosts, cert, waitForNodePort, e2eservice.LoadBalancerPollTimeout)
return j.pollIngressWithCert(j.Ingress, address, knownHosts, cert, waitForNodePort, propagationTimeout)
}
// VerifyURL polls for the given iterations, in intervals, and fails if the
@@ -960,9 +961,10 @@ func (j *TestJig) ConstructFirewallForIngress(firewallRuleName string, nodeTags
// GetDistinctResponseFromIngress tries GET call to the ingress VIP and return all distinct responses.
func (j *TestJig) GetDistinctResponseFromIngress() (sets.String, error) {
// Wait for the loadbalancer IP.
address, err := j.WaitForIngressAddress(j.Client, j.Ingress.Namespace, j.Ingress.Name, e2eservice.LoadBalancerPollTimeout)
propagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(j.Client)
address, err := j.WaitForIngressAddress(j.Client, j.Ingress.Namespace, j.Ingress.Name, propagationTimeout)
if err != nil {
framework.Failf("Ingress failed to acquire an IP address within %v", e2eservice.LoadBalancerPollTimeout)
framework.Failf("Ingress failed to acquire an IP address within %v", propagationTimeout)
}
responses := sets.NewString()
timeoutClient := &http.Client{Timeout: IngressReqTimeout}

View File

@@ -50,21 +50,25 @@ const (
// LoadBalancerCreateTimeoutDefault is the default time to wait for a load balancer to be created/modified.
// TODO: once support ticket 21807001 is resolved, reduce this timeout back to something reasonable
LoadBalancerCreateTimeoutDefault = 20 * time.Minute
// Hideen - use GetServiceLoadBalancerCreateTimeout function instead.
loadBalancerCreateTimeoutDefault = 20 * time.Minute
// LoadBalancerCreateTimeoutLarge is the maximum time to wait for a load balancer to be created/modified.
LoadBalancerCreateTimeoutLarge = 2 * time.Hour
// Hideen - use GetServiceLoadBalancerCreateTimeout function instead.
loadBalancerCreateTimeoutLarge = 2 * time.Hour
// LoadBalancerPropagationTimeoutDefault is the default time to wait for pods to
// be targeted by load balancers.
LoadBalancerPropagationTimeoutDefault = 10 * time.Minute
// Hideen - use GetServiceLoadBalancerPropagationTimeout function instead.
loadBalancerPropagationTimeoutDefault = 10 * time.Minute
// LoadBalancerPropagationTimeoutLarge is the maximum time to wait for pods to
// be targeted by load balancers.
// Hideen - use GetServiceLoadBalancerPropagationTimeout function instead.
loadBalancerPropagationTimeoutLarge = time.Hour
// LoadBalancerCleanupTimeout is the time required by the loadbalancer to cleanup, proportional to numApps/Ing.
// Bring the cleanup timeout back down to 5m once b/33588344 is resolved.
LoadBalancerCleanupTimeout = 15 * time.Minute
// LoadBalancerPollTimeout is the time required by the loadbalancer to poll.
// On average it takes ~6 minutes for a single backend to come online in GCE.
LoadBalancerPollTimeout = 22 * time.Minute
// LoadBalancerPollInterval is the interval value in which the loadbalancer polls.
LoadBalancerPollInterval = 30 * time.Second

View File

@@ -300,7 +300,7 @@ func (j *TestJig) GetEndpointNodeNames() (sets.String, error) {
// WaitForEndpointOnNode waits for a service endpoint on the given node.
func (j *TestJig) WaitForEndpointOnNode(nodeName string) error {
return wait.PollImmediate(framework.Poll, LoadBalancerPropagationTimeoutDefault, func() (bool, error) {
return wait.PollImmediate(framework.Poll, KubeProxyLagTimeout, func() (bool, error) {
endpoints, err := j.Client.CoreV1().Endpoints(j.Namespace).Get(context.TODO(), j.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Get endpoints for service %s/%s failed (%s)", j.Namespace, j.Name, err)

View File

@@ -104,9 +104,19 @@ func GetServiceLoadBalancerCreationTimeout(cs clientset.Interface) time.Duration
nodes, err := e2enode.GetReadySchedulableNodes(cs)
framework.ExpectNoError(err)
if len(nodes.Items) > LargeClusterMinNodesNumber {
return LoadBalancerCreateTimeoutLarge
return loadBalancerCreateTimeoutLarge
}
return LoadBalancerCreateTimeoutDefault
return loadBalancerCreateTimeoutDefault
}
// GetServiceLoadBalancerPropagationTimeout returns a timeout value for propagating a load balancer of a service.
func GetServiceLoadBalancerPropagationTimeout(cs clientset.Interface) time.Duration {
nodes, err := e2enode.GetReadySchedulableNodes(cs)
framework.ExpectNoError(err)
if len(nodes.Items) > LargeClusterMinNodesNumber {
return loadBalancerPropagationTimeoutLarge
}
return loadBalancerPropagationTimeoutDefault
}
// CreateServiceForSimpleAppWithPods is a convenience wrapper to create a service and its matching pods all at once.

View File

@@ -167,7 +167,7 @@ var _ = SIGDescribe("Firewall rule", func() {
// Send requests from outside of the cluster because internal traffic is whitelisted
ginkgo.By("Accessing the external service ip from outside, all non-master nodes should be reached")
err = testHitNodesFromOutside(svcExternalIP, firewallTestHTTPPort, e2eservice.LoadBalancerPropagationTimeoutDefault, nodesSet)
err = testHitNodesFromOutside(svcExternalIP, firewallTestHTTPPort, e2eservice.GetServiceLoadBalancerPropagationTimeout(cs), nodesSet)
framework.ExpectNoError(err)
// Check if there are overlapping tags on the firewall that extend beyond just the vms in our cluster
@@ -188,12 +188,12 @@ var _ = SIGDescribe("Firewall rule", func() {
nodesSet.Insert(nodesNames[0])
gce.SetInstanceTags(cloudConfig, nodesNames[0], zone, removedTags)
// Make sure traffic is recovered before exit
err = testHitNodesFromOutside(svcExternalIP, firewallTestHTTPPort, e2eservice.LoadBalancerPropagationTimeoutDefault, nodesSet)
err = testHitNodesFromOutside(svcExternalIP, firewallTestHTTPPort, e2eservice.GetServiceLoadBalancerPropagationTimeout(cs), nodesSet)
framework.ExpectNoError(err)
}()
ginkgo.By("Accessing serivce through the external ip and examine got no response from the node without tags")
err = testHitNodesFromOutsideWithCount(svcExternalIP, firewallTestHTTPPort, e2eservice.LoadBalancerPropagationTimeoutDefault, nodesSet, 15)
err = testHitNodesFromOutsideWithCount(svcExternalIP, firewallTestHTTPPort, e2eservice.GetServiceLoadBalancerPropagationTimeout(cs), nodesSet, 15)
framework.ExpectNoError(err)
})

View File

@@ -172,7 +172,8 @@ var _ = SIGDescribe("Loadbalancing: L7", func() {
}, map[string]string{})
ginkgo.By(fmt.Sprintf("waiting for Ingress %s to get instance group annotation", name))
pollErr := wait.Poll(2*time.Second, e2eservice.LoadBalancerPollTimeout, func() (bool, error) {
propagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(f.ClientSet)
pollErr := wait.Poll(2*time.Second, propagationTimeout, func() (bool, error) {
ing, err := f.ClientSet.NetworkingV1beta1().Ingresses(ns).Get(context.TODO(), name, metav1.GetOptions{})
framework.ExpectNoError(err)
annotations := ing.Annotations
@@ -287,6 +288,7 @@ var _ = SIGDescribe("Loadbalancing: L7", func() {
ginkgo.It("should be able to switch between IG and NEG modes", func() {
var err error
propagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(f.ClientSet)
ginkgo.By("Create a basic HTTP ingress using NEG")
jig.CreateIngress(filepath.Join(e2eingress.IngressManifestPath, "neg"), ns, map[string]string{}, map[string]string{})
jig.WaitForIngress(true)
@@ -301,7 +303,7 @@ var _ = SIGDescribe("Loadbalancing: L7", func() {
_, err = f.ClientSet.CoreV1().Services(ns).Update(context.TODO(), &svc, metav1.UpdateOptions{})
framework.ExpectNoError(err)
}
err = wait.Poll(5*time.Second, e2eservice.LoadBalancerPollTimeout, func() (bool, error) {
err = wait.Poll(5*time.Second, propagationTimeout, func() (bool, error) {
if err := gceController.BackendServiceUsingIG(jig.GetServicePorts(false)); err != nil {
framework.Logf("ginkgo.Failed to verify IG backend service: %v", err)
return false, nil
@@ -319,7 +321,7 @@ var _ = SIGDescribe("Loadbalancing: L7", func() {
_, err = f.ClientSet.CoreV1().Services(ns).Update(context.TODO(), &svc, metav1.UpdateOptions{})
framework.ExpectNoError(err)
}
err = wait.Poll(5*time.Second, e2eservice.LoadBalancerPollTimeout, func() (bool, error) {
err = wait.Poll(5*time.Second, propagationTimeout, func() (bool, error) {
if err := gceController.BackendServiceUsingNEG(jig.GetServicePorts(false)); err != nil {
framework.Logf("ginkgo.Failed to verify NEG backend service: %v", err)
return false, nil
@@ -406,7 +408,8 @@ var _ = SIGDescribe("Loadbalancing: L7", func() {
_, err = f.ClientSet.AppsV1().Deployments(ns).UpdateScale(context.TODO(), name, scale, metav1.UpdateOptions{})
framework.ExpectNoError(err)
err = wait.Poll(10*time.Second, e2eservice.LoadBalancerPollTimeout, func() (bool, error) {
propagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(f.ClientSet)
err = wait.Poll(10*time.Second, propagationTimeout, func() (bool, error) {
res, err := jig.GetDistinctResponseFromIngress()
if err != nil {
return false, nil
@@ -423,7 +426,7 @@ var _ = SIGDescribe("Loadbalancing: L7", func() {
deploy.Spec.Template.Spec.TerminationGracePeriodSeconds = &gracePeriod
_, err = f.ClientSet.AppsV1().Deployments(ns).Update(context.TODO(), deploy, metav1.UpdateOptions{})
framework.ExpectNoError(err)
err = wait.Poll(10*time.Second, e2eservice.LoadBalancerPollTimeout, func() (bool, error) {
err = wait.Poll(10*time.Second, propagationTimeout, func() (bool, error) {
res, err := jig.GetDistinctResponseFromIngress()
framework.ExpectNoError(err)
deploy, err := f.ClientSet.AppsV1().Deployments(ns).Get(context.TODO(), name, metav1.GetOptions{})
@@ -832,12 +835,14 @@ func executeStaticIPHttpsOnlyTest(f *framework.Framework, jig *e2eingress.TestJi
e2eingress.IngressAllowHTTPKey: "false",
}, map[string]string{})
propagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(f.ClientSet)
ginkgo.By("waiting for Ingress to come up with ip: " + ip)
httpClient := e2eingress.BuildInsecureClient(e2eingress.IngressReqTimeout)
framework.ExpectNoError(e2eingress.PollURL(fmt.Sprintf("https://%s/", ip), "", e2eservice.LoadBalancerPollTimeout, jig.PollInterval, httpClient, false))
framework.ExpectNoError(e2eingress.PollURL(fmt.Sprintf("https://%s/", ip), "", propagationTimeout, jig.PollInterval, httpClient, false))
ginkgo.By("should reject HTTP traffic")
framework.ExpectNoError(e2eingress.PollURL(fmt.Sprintf("http://%s/", ip), "", e2eservice.LoadBalancerPollTimeout, jig.PollInterval, httpClient, true))
framework.ExpectNoError(e2eingress.PollURL(fmt.Sprintf("http://%s/", ip), "", propagationTimeout, jig.PollInterval, httpClient, true))
}
func executeBacksideBacksideHTTPSTest(f *framework.Framework, jig *e2eingress.TestJig, staticIPName string) {
@@ -850,14 +855,15 @@ func executeBacksideBacksideHTTPSTest(f *framework.Framework, jig *e2eingress.Te
}
}()
framework.ExpectNoError(err, "ginkgo.Failed to create re-encryption ingress")
propagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(f.ClientSet)
ginkgo.By(fmt.Sprintf("Waiting for ingress %s to come up", ingCreated.Name))
ingIP, err := jig.WaitForIngressAddress(f.ClientSet, f.Namespace.Name, ingCreated.Name, e2eservice.LoadBalancerPollTimeout)
ingIP, err := jig.WaitForIngressAddress(f.ClientSet, f.Namespace.Name, ingCreated.Name, propagationTimeout)
framework.ExpectNoError(err, "ginkgo.Failed to wait for ingress IP")
ginkgo.By(fmt.Sprintf("Polling on address %s and verify the backend is serving HTTPS", ingIP))
timeoutClient := &http.Client{Timeout: e2eingress.IngressReqTimeout}
err = wait.PollImmediate(e2eservice.LoadBalancerPollInterval, e2eservice.LoadBalancerPollTimeout, func() (bool, error) {
err = wait.PollImmediate(e2eservice.LoadBalancerPollInterval, propagationTimeout, func() (bool, error) {
resp, err := e2eingress.SimpleGET(timeoutClient, fmt.Sprintf("http://%s", ingIP), "")
if err != nil {
framework.Logf("SimpleGET failed: %v", err)

View File

@@ -68,16 +68,6 @@ const (
defaultServeHostnameServicePort = 80
defaultServeHostnameServiceName = "svc-hostname"
// KubeProxyLagTimeout is the maximum time a kube-proxy daemon on a node is allowed
// to not notice a Service update, such as type=NodePort.
// TODO: This timeout should be O(10s), observed values are O(1m), 5m is very
// liberal. Fix tracked in #20567.
KubeProxyLagTimeout = 5 * time.Minute
// LoadBalancerPollTimeout is the time required by the loadbalancer to poll.
// On average it takes ~6 minutes for a single backend to come online in GCE.
LoadBalancerPollTimeout = 22 * time.Minute
// AffinityTimeout is the maximum time that CheckAffinity is allowed to take; this
// needs to be more than long enough for AffinityConfirmCount HTTP requests to
// complete in a busy CI cluster, but shouldn't be too long since we will end up
@@ -118,13 +108,13 @@ type portsByPodName map[string][]int
// number of same response observed in a row. If affinity is not expected, the
// test will keep observe until different responses observed. The function will
// return false only in case of unexpected errors.
func checkAffinity(execPod *v1.Pod, serviceIP string, servicePort int, shouldHold bool) bool {
func checkAffinity(cs clientset.Interface, execPod *v1.Pod, serviceIP string, servicePort int, shouldHold bool) bool {
serviceIPPort := net.JoinHostPort(serviceIP, strconv.Itoa(servicePort))
curl := fmt.Sprintf(`curl -q -s --connect-timeout 2 http://%s/`, serviceIPPort)
cmd := fmt.Sprintf("for i in $(seq 0 %d); do echo; %s ; done", AffinityConfirmCount, curl)
timeout := AffinityTimeout
if execPod == nil {
timeout = LoadBalancerPollTimeout
timeout = e2eservice.GetServiceLoadBalancerPropagationTimeout(cs)
}
var tracker affinityTracker
// interval considering a maximum of 2 seconds per connection
@@ -316,7 +306,7 @@ func verifyServeHostnameServiceUp(c clientset.Interface, ns, host string, expect
gotEndpoints := sets.NewString()
// Retry cmdFunc for a while
for start := time.Now(); time.Since(start) < KubeProxyLagTimeout; time.Sleep(5 * time.Second) {
for start := time.Now(); time.Since(start) < e2eservice.KubeProxyLagTimeout; time.Sleep(5 * time.Second) {
for _, endpoint := range strings.Split(cmdFunc(), "\n") {
trimmedEp := strings.TrimSpace(endpoint)
if trimmedEp != "" {
@@ -2355,7 +2345,8 @@ var _ = SIGDescribe("Services", func() {
ginkgo.By("health check should be reconciled")
pollInterval := framework.Poll * 10
if pollErr := wait.PollImmediate(pollInterval, e2eservice.LoadBalancerPropagationTimeoutDefault, func() (bool, error) {
loadBalancerPropagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(cs)
if pollErr := wait.PollImmediate(pollInterval, loadBalancerPropagationTimeout, func() (bool, error) {
hc, err := gceCloud.GetHTTPHealthCheck(hcName)
if err != nil {
framework.Logf("ginkgo.Failed to get HttpHealthCheck(%q): %v", hcName, err)
@@ -3014,8 +3005,9 @@ var _ = SIGDescribe("ESIPP [Slow] [DisabledForLargeClusters]", func() {
cmd := fmt.Sprintf(`curl -q -s --connect-timeout 30 %v`, path)
var srcIP string
loadBalancerPropagationTimeout := e2eservice.GetServiceLoadBalancerPropagationTimeout(cs)
ginkgo.By(fmt.Sprintf("Hitting external lb %v from pod %v on node %v", ingressIP, pausePod.Name, pausePod.Spec.NodeName))
if pollErr := wait.PollImmediate(framework.Poll, e2eservice.LoadBalancerPropagationTimeoutDefault, func() (bool, error) {
if pollErr := wait.PollImmediate(framework.Poll, loadBalancerPropagationTimeout, func() (bool, error) {
stdout, err := framework.RunHostCmd(pausePod.Namespace, pausePod.Name, cmd)
if err != nil {
framework.Logf("got err: %v, retry until timeout", err)
@@ -3220,7 +3212,7 @@ func execAffinityTestForSessionAffinityTimeout(f *framework.Framework, cs client
framework.ExpectNoError(err)
// the service should be sticky until the timeout expires
framework.ExpectEqual(checkAffinity(execPod, svcIP, servicePort, true), true)
framework.ExpectEqual(checkAffinity(cs, execPod, svcIP, servicePort, true), true)
// but it should return different hostnames after the timeout expires
// try several times to avoid the probability that we hit the same pod twice
hosts := sets.NewString()
@@ -3287,19 +3279,19 @@ func execAffinityTestForNonLBServiceWithOptionalTransition(f *framework.Framewor
framework.ExpectNoError(err)
if !isTransitionTest {
framework.ExpectEqual(checkAffinity(execPod, svcIP, servicePort, true), true)
framework.ExpectEqual(checkAffinity(cs, execPod, svcIP, servicePort, true), true)
}
if isTransitionTest {
_, err = jig.UpdateService(func(svc *v1.Service) {
svc.Spec.SessionAffinity = v1.ServiceAffinityNone
})
framework.ExpectNoError(err)
framework.ExpectEqual(checkAffinity(execPod, svcIP, servicePort, false), true)
framework.ExpectEqual(checkAffinity(cs, execPod, svcIP, servicePort, false), true)
_, err = jig.UpdateService(func(svc *v1.Service) {
svc.Spec.SessionAffinity = v1.ServiceAffinityClientIP
})
framework.ExpectNoError(err)
framework.ExpectEqual(checkAffinity(execPod, svcIP, servicePort, true), true)
framework.ExpectEqual(checkAffinity(cs, execPod, svcIP, servicePort, true), true)
}
}
@@ -3337,19 +3329,19 @@ func execAffinityTestForLBServiceWithOptionalTransition(f *framework.Framework,
port := int(svc.Spec.Ports[0].Port)
if !isTransitionTest {
framework.ExpectEqual(checkAffinity(nil, ingressIP, port, true), true)
framework.ExpectEqual(checkAffinity(cs, nil, ingressIP, port, true), true)
}
if isTransitionTest {
svc, err = jig.UpdateService(func(svc *v1.Service) {
svc.Spec.SessionAffinity = v1.ServiceAffinityNone
})
framework.ExpectNoError(err)
framework.ExpectEqual(checkAffinity(nil, ingressIP, port, false), true)
framework.ExpectEqual(checkAffinity(cs, nil, ingressIP, port, false), true)
svc, err = jig.UpdateService(func(svc *v1.Service) {
svc.Spec.SessionAffinity = v1.ServiceAffinityClientIP
})
framework.ExpectNoError(err)
framework.ExpectEqual(checkAffinity(nil, ingressIP, port, true), true)
framework.ExpectEqual(checkAffinity(cs, nil, ingressIP, port, true), true)
}
}