
separated from the apiserver running locally on the master node so that it can be optionally enabled or disabled as needed. Also, fix the healthchecking configuration for the master components, which was previously only working by coincidence: If a kubelet doesn't register with a master, it never bothers to figure out what its local address is. In which case it ends up constructing a URL like http://:8080/healthz for the http probe. This happens to work on the master because all of the pods are using host networking and explicitly binding to 127.0.0.1. Once the kubelet is registered with the master and it determines the local node address, it tries to healthcheck on an address where the pod isn't listening and the kubelet periodically restarts each master component when the liveness probe fails.
108 lines
3.8 KiB
Bash
Executable File
108 lines
3.8 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
# Copyright 2014 The Kubernetes Authors All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Validates that the cluster is healthy.
|
|
|
|
set -o errexit
|
|
set -o nounset
|
|
set -o pipefail
|
|
|
|
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
|
|
source "${KUBE_ROOT}/cluster/kube-env.sh"
|
|
source "${KUBE_ROOT}/cluster/kube-util.sh"
|
|
|
|
MINIONS_FILE=/tmp/minions-$$
|
|
trap 'rm -rf "${MINIONS_FILE}"' EXIT
|
|
|
|
EXPECTED_NUM_NODES="${NUM_MINIONS}"
|
|
if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
|
|
EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+1))
|
|
fi
|
|
# Make several attempts to deal with slow cluster birth.
|
|
attempt=0
|
|
while true; do
|
|
# The "kubectl get nodes" output is three columns like this:
|
|
#
|
|
# NAME LABELS STATUS
|
|
# kubernetes-minion-03nb <none> Ready
|
|
#
|
|
# Echo the output, strip the first line, then gather 2 counts:
|
|
# - Total number of nodes.
|
|
# - Number of "ready" nodes.
|
|
#
|
|
# Suppress errors from kubectl output because during cluster bootstrapping
|
|
# for clusters where the master node is registered, the apiserver will become
|
|
# available and then get restarted as the kubelet configures the docker bridge.
|
|
"${KUBE_ROOT}/cluster/kubectl.sh" get nodes > "${MINIONS_FILE}" 2> /dev/null || true
|
|
found=$(cat "${MINIONS_FILE}" | sed '1d' | grep -c .) || true
|
|
ready=$(cat "${MINIONS_FILE}" | sed '1d' | awk '{print $NF}' | grep -c '^Ready') || true
|
|
|
|
if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
|
|
break
|
|
else
|
|
# Set the timeout to ~10minutes (40 x 15 second) to avoid timeouts for 100-node clusters.
|
|
if (( attempt > 40 )); then
|
|
echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be working.${color_norm}"
|
|
cat -n "${MINIONS_FILE}"
|
|
exit 2
|
|
else
|
|
echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
|
|
fi
|
|
attempt=$((attempt+1))
|
|
sleep 15
|
|
fi
|
|
done
|
|
echo "Found ${found} nodes."
|
|
echo -n " "
|
|
head -n 1 "${MINIONS_FILE}"
|
|
tail -n +2 "${MINIONS_FILE}" | cat -n
|
|
|
|
attempt=0
|
|
while true; do
|
|
kubectl_output=$("${KUBE_ROOT}/cluster/kubectl.sh" get cs) || true
|
|
|
|
# The "kubectl componentstatuses" output is four columns like this:
|
|
#
|
|
# COMPONENT HEALTH MSG ERR
|
|
# controller-manager Healthy ok nil
|
|
#
|
|
# Parse the output to capture the value of the second column("HEALTH"), then use grep to
|
|
# count the number of times it doesn't match "Healthy".
|
|
non_success_count=$(echo "${kubectl_output}" | \
|
|
sed '1d' |
|
|
sed -n 's/^[[:alnum:][:punct:]]/&/p' | \
|
|
grep --invert-match -c '^[[:alnum:][:punct:]]\{1,\}[[:space:]]\{1,\}Healthy') || true
|
|
|
|
if ((non_success_count > 0)); then
|
|
if ((attempt < 5)); then
|
|
echo -e "${color_yellow}Cluster not working yet.${color_norm}"
|
|
attempt=$((attempt+1))
|
|
sleep 30
|
|
else
|
|
echo -e " ${color_yellow}Validate output:${color_norm}"
|
|
echo "${kubectl_output}"
|
|
echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
|
|
exit 1
|
|
fi
|
|
else
|
|
break
|
|
fi
|
|
done
|
|
|
|
echo "Validate output:"
|
|
echo "${kubectl_output}"
|
|
echo -e "${color_green}Cluster validation succeeded${color_norm}"
|