
The command `docker ps` can take a long time to respond under heavy load or when encountering some known issues. In these cases the containers are running fine, so an aggressive health check could cause serious disruption. Bump the timeout to 60s to be consistent with the Debian-based ContainerVM.
82 lines
2.5 KiB
Bash
82 lines
2.5 KiB
Bash
#!/bin/bash
|
|
|
|
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
|
|
# This script is for master and node instance health monitoring, which is
# packed in kube-manifest tarball. It is executed through a systemd service
# in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
# file provided by the systemd service.
|
|
|
|
# Fail on expansion of unset variables, and make a pipeline return a failure
# status when any of its stages fails. errexit is not enabled here.
set -o nounset
set -o pipefail
|
|
|
|
# We simply kill the process when there is a failure. Another systemd service will
# automatically restart the process.

#######################################
# Polls the Docker daemon in an endless loop and kills it when it stops
# responding.
# Globals:
#   SLEEP_SECONDS (read) - pause between two successful health checks.
# Outputs:
#   Writes "Docker daemon failed!" to stdout on every failed check.
# Returns:
#   Never returns; the loop runs until the script itself is terminated.
#######################################
function docker_monitoring {
  while true; do
    # `docker ps` can be slow under heavy load even though the containers are
    # fine, so give it a generous 60 second budget before declaring failure.
    if ! timeout 60 docker ps > /dev/null; then
      echo "Docker daemon failed!"
      pkill docker
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 30
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
|
|
|
|
#######################################
# Polls the kubelet /healthz endpoint in an endless loop and kills the kubelet
# when the check fails.
# Globals:
#   KUBELET_PORT  (read, optional) - healthz port; defaults to 10250.
#   SLEEP_SECONDS (read)           - pause between two successful checks.
# Outputs:
#   Writes progress and failure messages, plus the response of a diagnostic
#   healthz request, to stdout.
# Returns:
#   Never returns; the loop runs until the script itself is terminated.
#######################################
function kubelet_monitoring {
  echo "Wait for 2 minutes for kubelet to be functional"
  # TODO(andyzheng0831): replace it with a more reliable method if possible.
  sleep 120
  local -r max_seconds=10
  local -r healthz_url="https://127.0.0.1:${KUBELET_PORT:-10250}/healthz"
  while true; do
    if ! curl --insecure -m "${max_seconds}" -f -s "${healthz_url}" > /dev/null; then
      echo "Kubelet is unhealthy!"
      # Repeat the request without -f/-s so the response body and any curl
      # error are logged. It must be time-bounded as well: an unbounded request
      # against a hung kubelet would block forever and pkill would never run.
      curl --insecure -m "${max_seconds}" "${healthz_url}"
      pkill kubelet
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 60
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}
|
|
|
|
|
|
############## Main Function ################
# Validates the single <docker/kubelet> argument, loads the kube-env file and
# starts the matching monitoring loop. Exits with status 1 on a usage error,
# a missing kube-env file, or an unsupported component.
if [[ "$#" -ne 1 ]]; then
  echo "Usage: health-monitor.sh <docker/kubelet>"
  exit 1
fi

KUBE_ENV="/home/kubernetes/kube-env"
if [[ ! -e "${KUBE_ENV}" ]]; then
  echo "The ${KUBE_ENV} file does not exist!! Terminate health monitoring"
  exit 1
fi

SLEEP_SECONDS=10
component=$1
echo "Start kubernetes health monitoring for ${component}"
# Sourced after the defaults above, so values set in kube-env take precedence.
source "${KUBE_ENV}"
if [[ "${component}" == "docker" ]]; then
  docker_monitoring
elif [[ "${component}" == "kubelet" ]]; then
  kubelet_monitoring
else
  # Keep the expansion inside the quotes so the value is printed verbatim,
  # and report the bad argument with a failure status like the usage error.
  echo "Health monitoring for component ${component} is not supported!"
  exit 1
fi
|