diff --git a/cluster/health-monitor.sh b/cluster/health-monitor.sh
index b9a690f42..0320a534d 100755
--- a/cluster/health-monitor.sh
+++ b/cluster/health-monitor.sh
@@ -19,6 +19,11 @@ set -o pipefail
 
 # CRICTL is the path of crictl
 CRICTL=${CRICTL:-"crictl"}
+# INITIAL_WAIT_ATTEMPTS is the maximum number of initial "crictl pods"
+# attempts made before health checking starts. cri-containerd and
+# containerd are started at around the same time as health-monitor,
+# so they may not be ready yet when health-monitor starts.
+INITIAL_WAIT_ATTEMPTS=${INITIAL_WAIT_ATTEMPTS:-5}
 # COMMAND_TIMEOUT is the timeout for the health check command.
 COMMAND_TIMEOUT=${COMMAND_TIMEOUT:-60}
 # CHECK_PERIOD is the health check period.
@@ -27,13 +32,23 @@ CHECK_PERIOD=${CHECK_PERIOD:-10}
 # and containerd.
 SLEEP_SECONDS=${SLEEP_SECONDS:-120}
 
+# Give the daemons a chance to come up: retry with a linearly growing
+# delay until crictl responds or the attempt budget is used up.
+attempt=1
+until timeout ${COMMAND_TIMEOUT} ${CRICTL} pods > /dev/null || (( attempt >= INITIAL_WAIT_ATTEMPTS ))
+do
+  echo "$attempt initial attempt \"$CRICTL pods\"! Trying again in $attempt seconds..."
+  sleep $(( attempt++ ))
+done
+
+echo "Start performing health check."
 while true; do
   # Use crictl pods because it requires both containerd and
   # cri-containerd to be working.
   if ! timeout ${COMMAND_TIMEOUT} ${CRICTL} pods > /dev/null; then
-    echo "crictl pods timeout!"
-    pkill containerd
-    pkill cri-containerd
+    echo "\"$CRICTL pods\" failed!"
+    pkill -x cri-containerd
+    pkill -x containerd
     # Wait for a while, as we don't want to kill it again before it is really up.
     sleep ${SLEEP_SECONDS}
   else
diff --git a/hack/test-utils.sh b/hack/test-utils.sh
index 705376914..86f56a2b8 100644
--- a/hack/test-utils.sh
+++ b/hack/test-utils.sh
@@ -45,7 +45,8 @@ test_setup() {
     echo "containerd is not installed, please run hack/install-deps.sh"
     exit 1
   fi
-  sudo pkill containerd
+  sudo pkill -x cri-containerd
+  sudo pkill -x containerd
   keepalive "sudo containerd" ${RESTART_WAIT_PERIOD} &> ${report_dir}/containerd.log &
   containerd_pid=$!
   # Wait for containerd to be running by using the containerd client ctr to check the version
@@ -69,7 +70,8 @@ test_teardown() {
   if [ -n "${cri_containerd_pid}" ]; then
     kill ${cri_containerd_pid}
   fi
-  sudo pkill containerd
+  sudo pkill -x cri-containerd
+  sudo pkill -x containerd
 }
 
 # keepalive runs a command and keeps it alive.
diff --git a/integration/test_utils.go b/integration/test_utils.go
index 9a88c1c2b..6200bd97e 100644
--- a/integration/test_utils.go
+++ b/integration/test_utils.go
@@ -206,7 +206,7 @@ func Randomize(str string) string {
 
 // KillProcess kills the process by name. pkill is used.
 func KillProcess(name string) error {
-	output, err := exec.Command("pkill", fmt.Sprintf("^%s$", name)).CombinedOutput()
+	output, err := exec.Command("pkill", "-x", fmt.Sprintf("^%s$", name)).CombinedOutput()
 	if err != nil {
 		return fmt.Errorf("failed to kill %q - error: %v, output: %q", name, err, output)
 	}