Merge pull request #7135 from GoogleCloudPlatform/revert-7124-revert-validate.fix

Revert "Rollback "validate-cluster.sh" to previous version."
This commit is contained in:
Robert Bailey
2015-04-21 13:44:54 -07:00

View File

@@ -14,11 +14,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Bring up a Kubernetes cluster. # Validates that the cluster is healthy.
#
# If the full release name (gs://<bucket>/<release>) is passed in then we take
# that directly. If not then we assume we are doing development stuff and take
# the defaults in the release config.
set -o errexit set -o errexit
set -o nounset set -o nounset
@@ -28,12 +24,9 @@ KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
source "${KUBE_ROOT}/cluster/kube-env.sh" source "${KUBE_ROOT}/cluster/kube-env.sh"
source "${KUBE_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh" source "${KUBE_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh"
get-password
detect-master > /dev/null
detect-minions > /dev/null
MINIONS_FILE=/tmp/minions-$$ MINIONS_FILE=/tmp/minions-$$
trap 'rm -rf "${MINIONS_FILE}"' EXIT trap 'rm -rf "${MINIONS_FILE}"' EXIT
# Make several attempts to deal with slow cluster birth. # Make several attempts to deal with slow cluster birth.
attempt=0 attempt=0
while true; do while true; do
@@ -54,62 +47,39 @@ done
echo "Found ${found} nodes." echo "Found ${found} nodes."
cat -n "${MINIONS_FILE}" cat -n "${MINIONS_FILE}"
# On vSphere, Vagrant, libvirt-coreos and Juju, use minion IPs as their names
if [[ "${KUBERNETES_PROVIDER}" == "vsphere" || "${KUBERNETES_PROVIDER}" == "vagrant" || "${KUBERNETES_PROVIDER}" == "libvirt-coreos" || "${KUBERNETES_PROVIDER}" == "juju" ]] ; then
MINION_NAMES=("${KUBE_MINION_IP_ADDRESSES[@]}")
fi
# On AWS we can't really name the minions, so just trust that if the number is right, the right names are there.
if [[ "${KUBERNETES_PROVIDER}" == "aws" ]]; then
MINION_NAMES=("$(cat ${MINIONS_FILE})")
# /healthz validation isn't working for some reason on AWS. So just hope for the best.
# TODO: figure out why and fix, it must be working in some form, or else clusters wouldn't work.
echo "Kubelet health checking on AWS isn't currently supported, assuming everything is good..."
echo -e "${color_green}Cluster validation succeeded${color_norm}"
exit 0
fi
for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
# Grep returns an exit status of 1 when line is not found, so we need the : to always return a 0 exit status
count=$(grep -c "${MINION_NAMES[$i]}" "${MINIONS_FILE}") || :
if [[ "${count}" == "0" ]]; then
echo -e "${color_red}Failed to find ${MINION_NAMES[$i]}, cluster is probably broken.${color_norm}"
cat -n "${MINIONS_FILE}"
exit 1
fi
name="${MINION_NAMES[$i]}"
if [[ "$KUBERNETES_PROVIDER" != "vsphere" && "$KUBERNETES_PROVIDER" != "vagrant" && "$KUBERNETES_PROVIDER" != "libvirt-coreos" && "$KUBERNETES_PROVIDER" != "juju" ]]; then
# Grab fully qualified name
name=$(grep "${MINION_NAMES[$i]}\." "${MINIONS_FILE}")
fi
# Make sure the kubelet is healthy.
# Make several attempts to deal with slow cluster birth.
attempt=0 attempt=0
while true; do while true; do
echo -n "Attempt $((attempt+1)) at checking Kubelet installation on node ${MINION_NAMES[$i]} ..." kubectl_output=$("${KUBE_ROOT}/cluster/kubectl.sh" get cs)
if [[ "$KUBERNETES_PROVIDER" != "libvirt-coreos" && "$KUBERNETES_PROVIDER" != "juju" ]]; then
curl_output=$(curl -s --insecure --user "${KUBE_USER}:${KUBE_PASSWORD}" \ # The "kubectl componentstatuses" output is four columns like this:
"https://${KUBE_MASTER_IP}/api/v1beta1/proxy/minions/${name}/healthz") #
# COMPONENT HEALTH MSG ERR
# controller-manager Healthy ok nil
#
# Parse the output to capture the value of the second column("HEALTH"), then use grep to
# count the number of lines whose value does not contain "Healthy".
# Because the header line is also counted, the actual unhealthy count is the count minus 1.
non_success_count=$(echo "${kubectl_output}" | \
sed -n 's/^\([[:alnum:][:punct:]]\+\)\s\+\([[:alnum:][:punct:]]\+\)\s\+.*/\2/p' | \
grep 'Healthy' --invert-match -c)
if ((non_success_count > 1)); then
if ((attempt < 5)); then
echo -e "${color_yellow}Cluster not working yet.${color_norm}"
attempt=$((attempt+1))
sleep 30
else else
curl_output=$(curl -s \ echo -e " ${color_yellow}Validate output:${color_norm}"
"http://${KUBE_MASTER_IP}:8080/api/v1beta1/proxy/minions/${name}/healthz") echo "${kubectl_output}"
fi echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
if [[ "${curl_output}" != "ok" ]]; then
if (( attempt > 5 )); then
echo
echo -e "${color_red}Kubelet failed to install on node ${MINION_NAMES[$i]}. Your cluster is unlikely to work correctly."
echo -e "Please run ./cluster/kube-down.sh and re-create the cluster. (sorry!)${color_norm}"
exit 1 exit 1
fi fi
else else
echo -e " ${color_green}[working]${color_norm}"
break break
fi fi
echo -e " ${color_yellow}[not working yet]${color_norm}"
attempt=$((attempt+1))
sleep 30
done
done done
echo "Validate output:"
echo "${kubectl_output}"
echo -e "${color_green}Cluster validation succeeded${color_norm}" echo -e "${color_green}Cluster validation succeeded${color_norm}"