Merge pull request #46371 from sjenning/fix-liveness-probe-reset
Automatic merge from submit-queue

reset resultRun on pod restart

xref https://bugzilla.redhat.com/show_bug.cgi?id=1455056

There is currently an issue where, if the pod is restarted because liveness probe failures exceeded failureThreshold, the failure count is not reset on the probe worker. When the pod restarts, a single liveness probe failure is enough to restart it again, so failureThreshold is not honored after the restart.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: busybox
spec:
  containers:
  - name: busybox
    image: busybox
    command:
    - sleep
    - "3600"
    livenessProbe:
      httpGet:
        path: /healthz
        port: 8080
      initialDelaySeconds: 3
      timeoutSeconds: 1
      periodSeconds: 3
      successThreshold: 1
      failureThreshold: 5
  terminationGracePeriodSeconds: 0
```

Before this PR:

```
$ kubectl create -f busybox-probe-fail.yaml
pod "busybox" created
$ kubectl get pod -w
NAME      READY     STATUS             RESTARTS   AGE
busybox   1/1       Running            0          4s
busybox   1/1       Running            1          24s
busybox   1/1       Running            2          33s
busybox   0/1       CrashLoopBackOff   2          39s
```

After this PR:

```
$ kubectl create -f busybox-probe-fail.yaml
$ kubectl get pod -w
NAME      READY     STATUS              RESTARTS   AGE
busybox   0/1       ContainerCreating   0          2s
busybox   1/1       Running             0          4s
busybox   1/1       Running             1          27s
busybox   1/1       Running             2          45s
```

```release-note
Fix kubelet to reset the liveness probe failure count across pod restart boundaries
```

Restarts now happen at even intervals.

@derekwaynecarr
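To see the failure mode outside the kubelet, here is a minimal, self-contained Go sketch of a consecutive-result counter in the spirit of the prober's resultRun. The probeWorker type, its observe method, and the result constants are illustrative stand-ins invented for this sketch, not the kubelet prober API; only the reset-on-restart behavior corresponds to the change in this PR.

```go
package main

import "fmt"

// result is a stand-in for the prober's probe result type.
type result string

const (
	success result = "success"
	failure result = "failure"
)

// probeWorker is an illustrative stand-in for the kubelet's probe worker:
// it tracks how many times in a row the same result has been observed.
type probeWorker struct {
	failureThreshold int
	lastResult       result
	resultRun        int // consecutive occurrences of lastResult
}

// observe records one probe result and reports whether the container
// should be restarted (a failure streak reaching failureThreshold).
func (w *probeWorker) observe(r result) (restart bool) {
	if r == w.lastResult {
		w.resultRun++
	} else {
		w.lastResult = r
		w.resultRun = 1
	}
	if r == failure && w.resultRun >= w.failureThreshold {
		// This reset is the essence of the fix: without it, resultRun keeps
		// its stale value across the restart, so the very first failure on
		// the restarted container immediately exceeds the threshold again.
		w.resultRun = 1
		return true
	}
	return false
}

func main() {
	w := &probeWorker{failureThreshold: 3, lastResult: success}
	for i := 1; i <= 5; i++ {
		fmt.Printf("failure %d -> restart=%v (resultRun=%d)\n", i, w.observe(failure), w.resultRun)
	}
}
```

With failureThreshold set to 3, the sketch reports restart=true only on every third consecutive failure. Dropping the reset inside the threshold branch makes every subsequent failure trip the threshold immediately, which is the too-frequent, unevenly spaced restart pattern shown in the "Before this PR" output above.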
```diff
@@ -224,6 +224,7 @@ func (w *worker) doProbe() (keepGoing bool) {
 		// chance of hitting #21751, where running `docker exec` when a
 		// container is being stopped may lead to corrupted container state.
 		w.onHold = true
+		w.resultRun = 1
 	}
 
 	return true
```
```diff
@@ -341,3 +341,44 @@ func TestOnHoldOnLivenessCheckFailure(t *testing.T) {
 		t.Errorf("Prober should not be on hold anymore")
 	}
 }
+
+func TestResultRunOnLivenessCheckFailure(t *testing.T) {
+	m := newTestManager()
+	w := newTestWorker(m, liveness, v1.Probe{SuccessThreshold: 1, FailureThreshold: 3})
+	m.statusManager.SetPodStatus(w.pod, getTestRunningStatus())
+
+	m.prober.exec = fakeExecProber{probe.Success, nil}
+	msg := "initial probe success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be 1")
+	}
+
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "probe failure, result success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be 1")
+	}
+
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "2nd probe failure, result success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 2 {
+		t.Errorf("Prober resultRun should be 2")
+	}
+
+	// Exceeding FailureThreshold should cause resultRun to
+	// reset to 1 so that the probe on the restarted pod
+	// also gets FailureThreshold attempts to succeed.
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "3rd probe failure, result failure"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Failure, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be reset to 1")
+	}
+}
```
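To run just the new test locally, the standard Go tooling works; the package path below (pkg/kubelet/prober) is an assumption based on where the prober worker code conventionally lives in the Kubernetes tree, not something stated in this diff.

```
$ go test ./pkg/kubelet/prober/ -run TestResultRunOnLivenessCheckFailure -v
```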