Merge pull request #46371 from sjenning/fix-liveness-probe-reset

Automatic merge from submit-queue

reset resultRun on pod restart

xref https://bugzilla.redhat.com/show_bug.cgi?id=1455056

There is currently an issue where, if a pod is restarted because its liveness probe failures exceeded failureThreshold, the failure count is not reset on the probe worker. After the restart, a single liveness probe failure restarts the pod again, so failureThreshold is not honored across the restart boundary.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: busybox
spec:
  containers:
  - name: busybox
    image: busybox
    command:
    - sleep
    - "3600"
    livenessProbe:
      httpGet:
        path: /healthz
        port: 8080
      initialDelaySeconds: 3
      timeoutSeconds: 1
      periodSeconds: 3
      successThreshold: 1
      failureThreshold: 5
  terminationGracePeriodSeconds: 0
```
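This spec fails by construction: the container only runs `sleep`, so nothing listens on port 8080 and every httpGet probe fails. Each restart cycle should therefore take roughly initialDelaySeconds + failureThreshold × periodSeconds ≈ 3 + 5 × 3 = 18 seconds.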

Before this PR:
```
$ kubectl create -f busybox-probe-fail.yaml 
pod "busybox" created
$ kubectl get pod -w
NAME      READY     STATUS    RESTARTS   AGE
busybox   1/1       Running   0          4s
busybox   1/1       Running   1         24s
busybox   1/1       Running   2         33s
busybox   0/1       CrashLoopBackOff   2         39s
```
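Note that the second restart arrives only 9 seconds after the first (24s → 33s): after the restart, a single failed probe was enough to kill the container, instead of the ~18 seconds that five failures should take.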

After this PR:
```
$ kubectl create -f busybox-probe-fail.yaml
$ kubectl get pod -w
NAME      READY     STATUS              RESTARTS   AGE
busybox   0/1       ContainerCreating   0          2s
busybox   1/1       Running   0         4s
busybox   1/1       Running   1         27s
busybox   1/1       Running   2         45s
```

```release-note
Fix kubelet to reset the liveness probe failure count across pod restart boundaries
```

Restarts now happen at even intervals: the gaps above (23s and 18s) match the expected ~18 seconds per cycle (initialDelaySeconds + failureThreshold × periodSeconds).

@derekwaynecarr
Merged by Kubernetes Submit Queue on 2017-06-03 15:15:49 -07:00, committed by GitHub.
2 changed files with 42 additions and 0 deletions


```diff
@@ -224,6 +224,7 @@ func (w *worker) doProbe() (keepGoing bool) {
 		// chance of hitting #21751, where running `docker exec` when a
 		// container is being stopped may lead to corrupted container state.
 		w.onHold = true
+		w.resultRun = 1
 	}
 	return true
```
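
For context, here is a condensed, runnable sketch of how the threshold logic behaves with this reset in place. Names like `worker`, `resultRun`, and `onHold` mirror the diff, but the control flow is simplified and is not a verbatim excerpt from the prober worker:

```go
package main

import "fmt"

// Simplified stand-in for the prober worker; only the fields
// relevant to this PR are modeled.
type worker struct {
	resultRun int  // consecutive identical probe results
	onHold    bool // probing paused until the container restarts
}

// recordFailure walks one probe failure through the threshold check.
// It returns true when the container must be restarted.
func (w *worker) recordFailure(failureThreshold int) bool {
	w.resultRun++
	if w.resultRun < failureThreshold {
		return false // below threshold: leave the container alone
	}
	// Threshold exceeded: the container will be restarted. Pause
	// probing and reset the run so the restarted container also
	// gets failureThreshold attempts (the one-line fix above).
	w.onHold = true
	w.resultRun = 1
	return true
}

func main() {
	w := &worker{}
	for i := 1; i <= 7; i++ {
		if w.recordFailure(5) {
			fmt.Printf("failure %d -> restart, resultRun reset to %d\n", i, w.resultRun)
			w.onHold = false // simulate the new container coming up
		} else {
			fmt.Printf("failure %d -> resultRun=%d\n", i, w.resultRun)
		}
	}
}
```

Without the reset, `resultRun` would still be at the threshold when the new container's first probe fails, triggering the immediate re-restart described above.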


```diff
@@ -341,3 +341,44 @@ func TestOnHoldOnLivenessCheckFailure(t *testing.T) {
 		t.Errorf("Prober should not be on hold anymore")
 	}
 }
+
+func TestResultRunOnLivenessCheckFailure(t *testing.T) {
+	m := newTestManager()
+	w := newTestWorker(m, liveness, v1.Probe{SuccessThreshold: 1, FailureThreshold: 3})
+	m.statusManager.SetPodStatus(w.pod, getTestRunningStatus())
+
+	m.prober.exec = fakeExecProber{probe.Success, nil}
+	msg := "initial probe success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be 1")
+	}
+
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "probe failure, result success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be 1")
+	}
+
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "2nd probe failure, result success"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Success, msg)
+	if w.resultRun != 2 {
+		t.Errorf("Prober resultRun should be 2")
+	}
+
+	// Exceeding FailureThreshold should cause resultRun to
+	// reset to 1 so that the probe on the restarted pod
+	// also gets FailureThreshold attempts to succeed.
+	m.prober.exec = fakeExecProber{probe.Failure, nil}
+	msg = "3rd probe failure, result failure"
+	expectContinue(t, w, w.doProbe(), msg)
+	expectResult(t, w, results.Failure, msg)
+	if w.resultRun != 1 {
+		t.Errorf("Prober resultRun should be reset to 1")
+	}
+}
```
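
Assuming the test lands in the prober package (pkg/kubelet/prober in the kubernetes tree), it can be run on its own with go test's -run filter:

```
$ go test ./pkg/kubelet/prober/ -run TestResultRunOnLivenessCheckFailure -v
```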