Removing old shell-based monitoring test.

This commit is contained in:
Vishnu Kannan 2015-03-11 21:39:56 +00:00
parent 425dd7e3ee
commit cbb3c96f31
2 changed files with 47 additions and 163 deletions

View File

@ -1,134 +0,0 @@
#!/bin/bash
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Assumes a running Kubernetes test cluster; verifies that the monitoring setup
# works. Assumes that we're being called by hack/e2e-test.sh (we use some env
# vars it sets up).
# Fail fast: abort on any failing command, on expansion of an unset variable,
# and on a failure anywhere inside a pipeline.
set -o errexit
set -o nounset
set -o pipefail
# Repository root, resolved relative to the location of this script.
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
# Each of the next three lines keeps a value already present in the
# environment and only falls back to the default shown when the variable is
# unset or empty.
: ${KUBE_VERSION_ROOT:=${KUBE_ROOT}}
: ${KUBECTL:="${KUBE_VERSION_ROOT}/cluster/kubectl.sh"}
: ${KUBE_CONFIG_FILE:="config-test.sh"}
export KUBECTL KUBE_CONFIG_FILE
# Load the cluster environment first, then the provider-specific helpers.
# NOTE(review): prepare-e2e, detect-project, GCLOUD, PROJECT and NETWORK are
# not defined in this script; presumably they come from the files sourced
# here -- confirm.
source "${KUBE_ROOT}/cluster/kube-env.sh"
source "${KUBE_VERSION_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh"
prepare-e2e
# Directory of monitoring addon manifests that setup creates and cleanup
# deletes.
MONITORING="${KUBE_ROOT}/cluster/addons/cluster-monitoring"
# NOTE(review): this unconditionally replaces the KUBECTL value chosen above
# (which honoured a caller-supplied KUBECTL and KUBE_VERSION_ROOT) -- confirm
# that the override is intended.
KUBECTL="${KUBE_ROOT}/cluster/kubectl.sh"
BIGRAND=$(printf "%x\n" $(( $RANDOM << 16 | $RANDOM ))) # two 15-bit $RANDOM draws combined, printed in hex
# Firewall rule name made unique per run by the random suffix.
MONITORING_FIREWALL_RULE="monitoring-test-${BIGRAND}"
# Starts the monitoring addon.
#
# On the gce and gke providers a firewall rule named by
# MONITORING_FIREWALL_RULE is created first, opening TCP ports 80, 8083, 8086
# and 9200 on NETWORK; a failure to create it is reported and yields a
# non-zero status. On every provider the manifests under MONITORING are then
# created through KUBECTL.
function setup {
  case "${KUBERNETES_PROVIDER}" in
    gce|gke)
      # Only these two providers have firewall work to do.
      detect-project
      if ! "${GCLOUD}" compute firewall-rules create "${MONITORING_FIREWALL_RULE}" \
        --project "${PROJECT}" \
        --network "${NETWORK}" \
        --quiet \
        --allow tcp:80 tcp:8083 tcp:8086 tcp:9200; then
        echo "Failed to set up firewall for monitoring" && false
      fi
      ;;
  esac
  "${KUBECTL}" create -f "${MONITORING}/"
}
# Tears down everything setup may have created. Best effort: every step
# tolerates failure, so this is safe to run before anything exists and again
# from the EXIT trap.
#
# Stops the two monitoring replication controllers, deletes the manifests
# under MONITORING and, on the gce and gke providers, deletes the firewall
# rule named by MONITORING_FIREWALL_RULE if it exists in PROJECT.
function cleanup {
  "${KUBECTL}" stop rc monitoring-influx-grafana-controller &> /dev/null || true
  "${KUBECTL}" stop rc monitoring-heapster-controller &> /dev/null || true
  "${KUBECTL}" delete -f "${MONITORING}/" &> /dev/null || true
  # This only has work to do on gce and gke
  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]] || [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
    detect-project
    # Probe the same project the rule is created in and deleted from;
    # otherwise the lookup depends on gcloud's configured default project and
    # an existing rule can be missed and left behind.
    if "${GCLOUD}" compute firewall-rules describe \
      --project "${PROJECT}" \
      "${MONITORING_FIREWALL_RULE}" &> /dev/null; then
      "${GCLOUD}" compute firewall-rules delete \
        --project "${PROJECT}" \
        --quiet \
        "${MONITORING_FIREWALL_RULE}" || true
    fi
  fi
}
# Checks that InfluxDB returns data for both the "stats" and "machine" series.
#
# Looks up the host IP of the influxGrafana pod through KUBECTL, then asks
# InfluxDB on port 8086 for one row of each series. The pair of queries is
# attempted up to 10 times, 5 seconds apart, and each curl call is also
# given its own --retry budget. Prints a message and exits the script with
# status 1 when the host IP cannot be determined or no attempt succeeds.
function influx-data-exists {
  local max_attempts=10
  local max_retries=10
  local retry_delay=30 # seconds
  local ip_template='{{range.items}}{{.currentState.hostIP}}:{{end}}'
  local influx_ip
  # Assigned separately from `local` so that a failed lookup is not hidden
  # behind the exit status of `local` itself.
  influx_ip=$("${KUBECTL}" get pods -l name=influxGrafana -o template -t "${ip_template}" | sed s/://g) || influx_ip=""
  if [[ -z "${influx_ip}" ]]; then
    echo "failed to determine the InfluxDB host IP. monitoring test failed"
    exit 1
  fi
  # Always expanded quoted below: the URL contains `?` and `&`.
  local influx_url="http://${influx_ip}:8086/db/k8s/series?u=root&p=root"
  # --fail makes curl exit non-zero on an HTTP error response; without it an
  # error page would be reported as successfully retrieved data.
  local -a curl_query=(curl --fail --retry "${max_retries}" --retry-delay "${retry_delay}" -G "${influx_url}")
  local ok="false"
  local attempt
  for attempt in $(seq 1 "${max_attempts}"); do
    if "${curl_query[@]}" --data-urlencode "q=select * from stats limit 1" \
      && "${curl_query[@]}" --data-urlencode "q=select * from machine limit 1"; then
      echo "retrieved data from InfluxDB."
      ok="true"
      break
    fi
    sleep 5
  done
  if [[ "${ok}" != "true" ]]; then
    echo "failed to retrieve stats from InfluxDB. monitoring test failed"
    exit 1
  fi
}
# Waits until at least one influxGrafana pod and at least one heapster pod
# report the Running status.
#
# Polls up to 20 times, sleeping 20 seconds before each poll. Prints a
# message and exits the script with status 1 if the pods are still not
# running after the last poll. Requires KUBECTL to be set.
function wait-for-pods {
  local max_polls=20
  local poll_delay=20 # seconds
  local status_template='{{range.items}}{{.currentState.status}}:{{end}}'
  local running="false"
  local attempt
  for attempt in $(seq 1 "${max_polls}"); do
    sleep "${poll_delay}"
    if "${KUBECTL}" get pods -l name=influxGrafana -o template -t "${status_template}" | grep Running &> /dev/null \
      && "${KUBECTL}" get pods -l name=heapster -o template -t "${status_template}" | grep Running &> /dev/null; then
      running="true"
      break
    fi
  done
  # The variable must be expanded here: a bare word would be compared as a
  # literal string and the timeout would never be reported.
  if [[ "${running}" != "true" ]]; then
    echo "giving up waiting on monitoring pods to be active. monitoring test failed"
    exit 1
  fi
}
# Run cleanup whenever the script exits, on success and on failure alike.
trap cleanup EXIT
# Remove any pre-existing monitoring services.
cleanup
# Start monitoring pods and services.
setup
# Poll (up to 20 times, 20 seconds apart) until the influxGrafana and
# heapster pods report Running.
echo "waiting for monitoring pods to be running"
wait-for-pods
# Wait for some time to let heapster push some stats to InfluxDB.
echo "monitoring pods are running. waiting for stats to be pushed to InfluxDB"
sleep 60
# Check that both the stats and machine series can be queried from InfluxDB.
echo "checking if stats exist in InfluxDB"
influx-data-exists
echo "monitoring setup works"
exit 0

View File

@ -39,7 +39,12 @@ var _ = Describe("Monitoring", func() {
expectNoError(err)
})
It("pod and node resource usage metrics are available on influxdb using heapster.", func() {
It("verify monitoring pods and all cluster nodes are available on influxdb using heapster.", func() {
if testContext.provider != "gce" {
By(fmt.Sprintf("Skipping Monitoring test, which is only supported for provider gce (not %s)",
testContext.provider))
return
}
testMonitoringUsingHeapsterInfluxdb(c)
})
})
@ -51,8 +56,8 @@ const (
influxdbPW = "root"
podlistQuery = "select distinct(pod) from stats"
nodelistQuery = "select distinct(hostname) from machine"
sleepBetweenAttempts = 30 * time.Second
maxAttempts = 10 // Total sleep time of 5 minutes for this test.
sleepBetweenAttempts = 5 * time.Second
testTimeout = 5 * time.Minute
)
var (
@ -67,27 +72,40 @@ var (
}
)
func expectedRcsExist(c *client.Client) {
func verifyExpectedRcsExistAndGetExpectedPods(c *client.Client) ([]string, error) {
rcList, err := c.ReplicationControllers(api.NamespaceDefault).List(labels.Everything())
expectNoError(err)
if err != nil {
return nil, err
}
expectedPods := []string{}
for _, rc := range rcList.Items {
if _, ok := expectedRcs[rc.Name]; ok {
if rc.Status.Replicas != 1 {
Failf("expected to find only one replica for rc %q, found %d", rc.Name, rc.Status.Replicas)
return nil, fmt.Errorf("expected to find only one replica for rc %q, found %d", rc.Name, rc.Status.Replicas)
}
expectedRcs[rc.Name] = true
podList, err := c.Pods(api.NamespaceDefault).List(labels.Set(rc.Spec.Selector).AsSelector())
if err != nil {
return nil, err
}
for _, pod := range podList.Items {
expectedPods = append(expectedPods, pod.Name)
}
}
}
for rc, found := range expectedRcs {
if !found {
Failf("Replication Controller %q not found.", rc)
return nil, fmt.Errorf("Replication Controller %q not found.", rc)
}
}
return expectedPods, nil
}
func expectedServicesExist(c *client.Client) {
func expectedServicesExist(c *client.Client) error {
serviceList, err := c.Services(api.NamespaceDefault).List(labels.Everything())
expectNoError(err)
if err != nil {
return err
}
for _, service := range serviceList.Items {
if _, ok := expectedServices[service.Name]; ok {
expectedServices[service.Name] = true
@ -95,29 +113,22 @@ func expectedServicesExist(c *client.Client) {
}
for service, found := range expectedServices {
if !found {
Failf("Service %q not found", service)
return fmt.Errorf("Service %q not found", service)
}
}
return nil
}
func getAllPodsInCluster(c *client.Client) []string {
podList, err := c.Pods(api.NamespaceAll).List(labels.Everything())
expectNoError(err)
result := []string{}
for _, pod := range podList.Items {
result = append(result, pod.Name)
}
return result
}
func getAllNodesInCluster(c *client.Client) []string {
func getAllNodesInCluster(c *client.Client) ([]string, error) {
nodeList, err := c.Nodes().List()
expectNoError(err)
if err != nil {
return nil, err
}
result := []string{}
for _, node := range nodeList.Items {
result = append(result, node.Name)
}
return result
return result, nil
}
func getInfluxdbData(c *influxdb.Client, query string) (map[string]bool, error) {
@ -133,6 +144,9 @@ func getInfluxdbData(c *influxdb.Client, query string) (map[string]bool, error)
}
result := map[string]bool{}
for _, point := range series[0].GetPoints() {
if len(point) != 2 {
Failf("Expected only two entries in a point for query %q. Got %v", query, point)
}
name, ok := point[1].(string)
if !ok {
Failf("expected %v to be a string, but it is %T", point[1], point[1])
@ -143,6 +157,9 @@ func getInfluxdbData(c *influxdb.Client, query string) (map[string]bool, error)
}
func expectedItemsExist(expectedItems []string, actualItems map[string]bool) bool {
if len(actualItems) < len(expectedItems) {
return false
}
for _, item := range expectedItems {
if _, found := actualItems[item]; !found {
return false
@ -182,8 +199,9 @@ func getMasterHost() string {
func testMonitoringUsingHeapsterInfluxdb(c *client.Client) {
// Check if heapster pods and services are up.
expectedRcsExist(c)
expectedServicesExist(c)
expectedPods, err := verifyExpectedRcsExistAndGetExpectedPods(c)
expectNoError(err)
expectNoError(expectedServicesExist(c))
// TODO: Wait for all pods and services to be running.
kubeMasterHttpClient, ok := c.Client.(*http.Client)
if !ok {
@ -202,14 +220,14 @@ func testMonitoringUsingHeapsterInfluxdb(c *client.Client) {
influxdbClient, err := influxdb.NewClient(config)
expectNoError(err, "failed to create influxdb client")
expectedPods := getAllPodsInCluster(c)
expectedNodes := getAllNodesInCluster(c)
attempt := maxAttempts
expectedNodes, err := getAllNodesInCluster(c)
expectNoError(err)
startTime := time.Now()
for {
if validatePodsAndNodes(influxdbClient, expectedPods, expectedNodes) {
return
}
if attempt--; attempt <= 0 {
if time.Since(startTime) >= testTimeout {
break
}
time.Sleep(sleepBetweenAttempts)