kubernetes/cluster/gce/container-linux/health-monitor.sh
Yu-Ju Hong 4e87973a9b Bump container-linux and gci timeout for docker health check
The command `docker ps` can take longer time to respond under heavy load or
when encountering some known issues. In these cases, the containers are running
fine, so aggressive health check could cause serious disruption. Bump the
timeout to 60s to be consistent with the debian-based containerVM.
2017-01-10 13:07:21 -08:00

82 lines
2.5 KiB
Bash

#!/bin/bash
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is for master and node instance health monitoring, which is
# packed in kube-manifest tarball. It is executed through a systemd service
# in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
# file provided by the systemd service.
set -o nounset
set -o pipefail
# We simply kill the process when there is a failure. Another systemd service will
# automatically restart the process.
function docker_monitoring {
while [ 1 ]; do
if ! timeout 60 docker ps > /dev/null; then
echo "Docker daemon failed!"
pkill docker
# Wait for a while, as we don't want to kill it again before it is really up.
sleep 30
else
sleep "${SLEEP_SECONDS}"
fi
done
}
function kubelet_monitoring {
echo "Wait for 2 minutes for kubelet to be fuctional"
# TODO(andyzheng0831): replace it with a more reliable method if possible.
sleep 120
local -r max_seconds=10
while [ 1 ]; do
if ! curl --insecure -m "${max_seconds}" -f -s https://127.0.0.1:${KUBELET_PORT:-10250}/healthz > /dev/null; then
echo "Kubelet is unhealthy!"
curl --insecure https://127.0.0.1:${KUBELET_PORT:-10250}/healthz
pkill kubelet
# Wait for a while, as we don't want to kill it again before it is really up.
sleep 60
else
sleep "${SLEEP_SECONDS}"
fi
done
}
############## Main Function ################
if [[ "$#" -ne 1 ]]; then
echo "Usage: health-monitor.sh <docker/kubelet>"
exit 1
fi
KUBE_ENV="/home/kubernetes/kube-env"
if [[ ! -e "${KUBE_ENV}" ]]; then
echo "The ${KUBE_ENV} file does not exist!! Terminate health monitoring"
exit 1
fi
SLEEP_SECONDS=10
component=$1
echo "Start kubernetes health monitoring for ${component}"
source "${KUBE_ENV}"
if [[ "${component}" == "docker" ]]; then
docker_monitoring
elif [[ "${component}" == "kubelet" ]]; then
kubelet_monitoring
else
echo "Health monitoring for component "${component}" is not supported!"
fi