Merge pull request #46371 from sjenning/fix-liveness-probe-reset

Automatic merge from submit-queue

reset resultRun on pod restart

xref https://bugzilla.redhat.com/show_bug.cgi?id=1455056

There is currently an issue where, if a pod is restarted because its liveness probe failures exceeded failureThreshold, the failure count is not reset on the probe worker. After the restart, a single liveness probe failure restarts the pod again, so failureThreshold is not honored across the restart boundary.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: busybox
spec:
  containers:
  - name: busybox
    image: busybox
    command:
    - sleep
    - "3600"
    livenessProbe:
      httpGet:
        path: /healthz
        port: 8080
      initialDelaySeconds: 3
      timeoutSeconds: 1
      periodSeconds: 3
      successThreshold: 1
      failureThreshold: 5
  terminationGracePeriodSeconds: 0
```
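This spec fails by construction: the container only runs `sleep`, so nothing listens on port 8080 and every httpGet probe fails. Each restart cycle should therefore take roughly initialDelaySeconds + failureThreshold × periodSeconds ≈ 3 + 5 × 3 = 18 seconds.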

Before this PR:
```
$ kubectl create -f busybox-probe-fail.yaml 
pod "busybox" created
$ kubectl get pod -w
NAME      READY     STATUS    RESTARTS   AGE
busybox   1/1       Running   0          4s
busybox   1/1       Running   1         24s
busybox   1/1       Running   2         33s
busybox   0/1       CrashLoopBackOff   2         39s
```
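Note that the second restart arrives only 9 seconds after the first (24s → 33s): after the restart, a single failed probe was enough to kill the container, instead of the ~18 seconds that five failures should take.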

After this PR:
```
$ kubectl create -f busybox-probe-fail.yaml
$ kubectl get pod -w
NAME      READY     STATUS              RESTARTS   AGE
busybox   0/1       ContainerCreating   0          2s
busybox   1/1       Running   0         4s
busybox   1/1       Running   1         27s
busybox   1/1       Running   2         45s
```

```release-note
Fix kubelet to reset the liveness probe failure count across pod restart boundaries
```

Restarts now happen at even intervals: the gaps above (23s and 18s) match the expected ~18 seconds per cycle (initialDelaySeconds + failureThreshold × periodSeconds).

@derekwaynecarr
Merged by Kubernetes Submit Queue on 2017-06-03 15:15:49 -07:00, committed by GitHub.
2 changed files with 42 additions and 0 deletions


```diff
@@ -224,6 +224,7 @@ func (w *worker) doProbe() (keepGoing bool) {
 		// chance of hitting #21751, where running `docker exec` when a
 		// container is being stopped may lead to corrupted container state.
 		w.onHold = true
+		w.resultRun = 1
 	}
 	return true
```
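
For context, here is a condensed, runnable sketch of how the threshold logic behaves with this reset in place. Names like `worker`, `resultRun`, and `onHold` mirror the diff, but the control flow is simplified and is not a verbatim excerpt from the prober worker:

```go
package main

import "fmt"

// Simplified stand-in for the prober worker; only the fields
// relevant to this PR are modeled.
type worker struct {
	resultRun int  // consecutive identical probe results
	onHold    bool // probing paused until the container restarts
}

// recordFailure walks one probe failure through the threshold check.
// It returns true when the container must be restarted.
func (w *worker) recordFailure(failureThreshold int) bool {
	w.resultRun++
	if w.resultRun < failureThreshold {
		return false // below threshold: leave the container alone
	}
	// Threshold exceeded: the container will be restarted. Pause
	// probing and reset the run so the restarted container also
	// gets failureThreshold attempts (the one-line fix above).
	w.onHold = true
	w.resultRun = 1
	return true
}

func main() {
	w := &worker{}
	for i := 1; i <= 7; i++ {
		if w.recordFailure(5) {
			fmt.Printf("failure %d -> restart, resultRun reset to %d\n", i, w.resultRun)
			w.onHold = false // simulate the new container coming up
		} else {
			fmt.Printf("failure %d -> resultRun=%d\n", i, w.resultRun)
		}
	}
}
```

Without the reset, `resultRun` would still be at the threshold when the new container's first probe fails, triggering the immediate re-restart described above.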


```diff
@@ -341,3 +341,44 @@ func TestOnHoldOnLivenessCheckFailure(t *testing.T) {
 		t.Errorf("Prober should not be on hold anymore")
 	}
 }
+
+func TestResultRunOnLivenessCheckFailure(t *testing.T) {
+	m := newTestManager()
+	w := newTestWorker(m, liveness, v1.Probe{SuccessThreshold: 1, FailureThreshold: 3})
+	m.statusManager.SetPodStatus(w.pod, getTestRunningStatus())
+
+	m.prober.exec = fakeExecProber{probe.Success, nil}
+	msg := "initial probe success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be 1")
+	}
+
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "probe failure, result success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be 1")
+	}
+
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "2nd probe failure, result success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 2 {
+		t.Errorf("Prober resultRun should be 2")
+	}
+
+	// Exceeding FailureThreshold should cause resultRun to
+	// reset to 1 so that the probe on the restarted pod
+	// also gets FailureThreshold attempts to succeed.
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "3rd probe failure, result failure"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Failure, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be reset to 1")
+	}
+}
```
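
Assuming the test lands in the prober package (pkg/kubelet/prober in the kubernetes tree), it can be run on its own with go test's -run filter:

```
$ go test ./pkg/kubelet/prober/ -run TestResultRunOnLivenessCheckFailure -v
```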