Merge pull request #8421 from yujuhong/active_deadline

Kubelet: move active deadline check to per pod worker
This commit is contained in:
Victor Marmol
2015-05-18 17:42:21 -07:00
3 changed files with 91 additions and 138 deletions

View File

@@ -1155,37 +1155,33 @@ func (kl *Kubelet) cleanupOrphanedVolumes(pods []*api.Pod, runningPods []*kubeco
return nil return nil
} }
// filterOutPodsPastActiveDeadline filters pods with an ActiveDeadlineSeconds value that has been exceeded. // pastActiveDeadline returns true if the pod has been active for more than
// It records an event that the pod has been active longer than the allocated time, and updates the pod status as failed. // ActiveDeadlineSeconds.
// By filtering the pod from the result set, the Kubelet will kill the pod's containers as part of normal SyncPods workflow. func (kl *Kubelet) pastActiveDeadline(pod *api.Pod) bool {
func (kl *Kubelet) filterOutPodsPastActiveDeadline(allPods []*api.Pod) (pods []*api.Pod) {
now := util.Now() now := util.Now()
for _, pod := range allPods { if pod.Spec.ActiveDeadlineSeconds != nil {
keepPod := true podStatus, ok := kl.statusManager.GetPodStatus(kubecontainer.GetPodFullName(pod))
if pod.Spec.ActiveDeadlineSeconds != nil { if !ok {
podStatus, ok := kl.statusManager.GetPodStatus(kubecontainer.GetPodFullName(pod)) podStatus = pod.Status
if !ok {
podStatus = pod.Status
}
if !podStatus.StartTime.IsZero() {
startTime := podStatus.StartTime.Time
duration := now.Time.Sub(startTime)
allowedDuration := time.Duration(*pod.Spec.ActiveDeadlineSeconds) * time.Second
if duration >= allowedDuration {
keepPod = false
}
}
} }
if keepPod { if !podStatus.StartTime.IsZero() {
pods = append(pods, pod) startTime := podStatus.StartTime.Time
} else { duration := now.Time.Sub(startTime)
kl.recorder.Eventf(pod, "deadline", "Pod was active on the node longer than specified deadline") allowedDuration := time.Duration(*pod.Spec.ActiveDeadlineSeconds) * time.Second
kl.statusManager.SetPodStatus(pod, api.PodStatus{ if duration >= allowedDuration {
Phase: api.PodFailed, return true
Message: "Pod was active on the node longer than specified deadline"}) }
} }
} }
return pods return false
}
// podIsTerminated returns true if the status is in one of the terminated states.
func podIsTerminated(status *api.PodStatus) bool {
if status.Phase == api.PodFailed || status.Phase == api.PodSucceeded {
return true
}
return false
} }
// Filter out pods in the terminated state ("Failed" or "Succeeded"). // Filter out pods in the terminated state ("Failed" or "Succeeded").
@@ -1201,8 +1197,7 @@ func (kl *Kubelet) filterOutTerminatedPods(allPods []*api.Pod) []*api.Pod {
// restarted. // restarted.
status = pod.Status status = pod.Status
} }
if status.Phase == api.PodFailed || status.Phase == api.PodSucceeded { if podIsTerminated(&status) {
// Pod has reached the final state; ignore it.
continue continue
} }
pods = append(pods, pod) pods = append(pods, pod)
@@ -1497,8 +1492,6 @@ func (kl *Kubelet) admitPods(allPods []*api.Pod, podSyncTypes map[types.UID]metr
// These two conditions could be alleviated by checkpointing kubelet. // These two conditions could be alleviated by checkpointing kubelet.
pods := kl.filterOutTerminatedPods(allPods) pods := kl.filterOutTerminatedPods(allPods)
pods = kl.filterOutPodsPastActiveDeadline(pods)
// Respect the pod creation order when resolving conflicts. // Respect the pod creation order when resolving conflicts.
sort.Sort(podsByCreationTime(pods)) sort.Sort(podsByCreationTime(pods))
@@ -1597,9 +1590,12 @@ func (kl *Kubelet) validateContainerStatus(podStatus *api.PodStatus, containerNa
// or all of them. // or all of them.
func (kl *Kubelet) GetKubeletContainerLogs(podFullName, containerName, tail string, follow, previous bool, stdout, stderr io.Writer) error { func (kl *Kubelet) GetKubeletContainerLogs(podFullName, containerName, tail string, follow, previous bool, stdout, stderr io.Writer) error {
// TODO(vmarmol): Refactor to not need the pod status and verification. // TODO(vmarmol): Refactor to not need the pod status and verification.
podStatus, err := kl.getPodStatus(podFullName) // Pod workers periodically write status to statusManager. If status is not
if err != nil { // cached there, something is wrong (or kubelet just restarted and hasn't
return fmt.Errorf("failed to get status for pod %q - %v", podFullName, err) // caught up yet). Just assume the pod is not ready yet.
podStatus, found := kl.statusManager.GetPodStatus(podFullName)
if !found {
return fmt.Errorf("failed to get status for pod %q", podFullName)
} }
if err := kl.validatePodPhase(&podStatus); err != nil { if err := kl.validatePodPhase(&podStatus); err != nil {
// No log is available if pod is not in a "known" phase (e.g. Unknown). // No log is available if pod is not in a "known" phase (e.g. Unknown).
@@ -1925,30 +1921,21 @@ func getPodReadyCondition(spec *api.PodSpec, statuses []api.ContainerStatus) []a
return ready return ready
} }
// getPodStatus returns information of the containers in the pod from the
// container runtime.
func (kl *Kubelet) getPodStatus(podFullName string) (api.PodStatus, error) {
// Check to see if we have a cached version of the status.
cachedPodStatus, found := kl.statusManager.GetPodStatus(podFullName)
if found {
glog.V(3).Infof("Returning cached status for %q", podFullName)
return cachedPodStatus, nil
}
pod, found := kl.GetPodByFullName(podFullName)
if !found {
return api.PodStatus{}, fmt.Errorf("couldn't find pod %q", podFullName)
}
return kl.generatePodStatus(pod)
}
// By passing the pod directly, this method avoids pod lookup, which requires // By passing the pod directly, this method avoids pod lookup, which requires
// grabbing a lock. // grabbing a lock.
func (kl *Kubelet) generatePodStatus(pod *api.Pod) (api.PodStatus, error) { func (kl *Kubelet) generatePodStatus(pod *api.Pod) (api.PodStatus, error) {
podFullName := kubecontainer.GetPodFullName(pod) podFullName := kubecontainer.GetPodFullName(pod)
glog.V(3).Infof("Generating status for %q", podFullName) glog.V(3).Infof("Generating status for %q", podFullName)
spec := &pod.Spec // TODO: Consider including the container information.
if kl.pastActiveDeadline(pod) {
kl.recorder.Eventf(pod, "deadline", "Pod was active on the node longer than specified deadline")
return api.PodStatus{
Phase: api.PodFailed,
Message: "Pod was active on the node longer than specified deadline"}, nil
}
spec := &pod.Spec
podStatus, err := kl.containerRuntime.GetPodStatus(pod) podStatus, err := kl.containerRuntime.GetPodStatus(pod)
if err != nil { if err != nil {

View File

@@ -3010,19 +3010,9 @@ func TestHandlePortConflicts(t *testing.T) {
kl.handleNotFittingPods(pods) kl.handleNotFittingPods(pods)
// Check pod status stored in the status map. // Check pod status stored in the status map.
status, err := kl.getPodStatus(conflictedPodName) status, found := kl.statusManager.GetPodStatus(conflictedPodName)
if err != nil { if !found {
t.Fatalf("status of pod %q is not found in the status map: %#v", conflictedPodName, err) t.Fatalf("status of pod %q is not found in the status map", conflictedPodName)
}
if status.Phase != api.PodFailed {
t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase)
}
// Check if we can retrieve the pod status from GetPodStatus().
kl.podManager.SetPods(pods)
status, err = kl.getPodStatus(conflictedPodName)
if err != nil {
t.Fatalf("unable to retrieve pod status for pod %q: %#v.", conflictedPodName, err)
} }
if status.Phase != api.PodFailed { if status.Phase != api.PodFailed {
t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase) t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase)
@@ -3062,19 +3052,9 @@ func TestHandleNodeSelector(t *testing.T) {
kl.handleNotFittingPods(pods) kl.handleNotFittingPods(pods)
// Check pod status stored in the status map. // Check pod status stored in the status map.
status, err := kl.getPodStatus(notfittingPodName) status, found := kl.statusManager.GetPodStatus(notfittingPodName)
if err != nil { if !found {
t.Fatalf("status of pod %q is not found in the status map: %#v", notfittingPodName, err) t.Fatalf("status of pod %q is not found in the status map", notfittingPodName)
}
if status.Phase != api.PodFailed {
t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase)
}
// Check if we can retrieve the pod status from GetPodStatus().
kl.podManager.SetPods(pods)
status, err = kl.getPodStatus(notfittingPodName)
if err != nil {
t.Fatalf("unable to retrieve pod status for pod %q: %#v.", notfittingPodName, err)
} }
if status.Phase != api.PodFailed { if status.Phase != api.PodFailed {
t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase) t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase)
@@ -3120,19 +3100,9 @@ func TestHandleMemExceeded(t *testing.T) {
kl.handleNotFittingPods(pods) kl.handleNotFittingPods(pods)
// Check pod status stored in the status map. // Check pod status stored in the status map.
status, err := kl.getPodStatus(notfittingPodName) status, found := kl.statusManager.GetPodStatus(notfittingPodName)
if err != nil { if !found {
t.Fatalf("status of pod %q is not found in the status map: %#v", notfittingPodName, err) t.Fatalf("status of pod %q is not found in the status map", notfittingPodName)
}
if status.Phase != api.PodFailed {
t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase)
}
// Check if we can retrieve the pod status from GetPodStatus().
kl.podManager.SetPods(pods)
status, err = kl.getPodStatus(notfittingPodName)
if err != nil {
t.Fatalf("unable to retrieve pod status for pod %q: %#v.", notfittingPodName, err)
} }
if status.Phase != api.PodFailed { if status.Phase != api.PodFailed {
t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase) t.Fatalf("expected pod status %q. Got %q.", api.PodFailed, status.Phase)
@@ -3153,13 +3123,13 @@ func TestPurgingObsoleteStatusMapEntries(t *testing.T) {
} }
// Run once to populate the status map. // Run once to populate the status map.
kl.handleNotFittingPods(pods) kl.handleNotFittingPods(pods)
if _, err := kl.getPodStatus(kubecontainer.BuildPodFullName("pod2", "")); err != nil { if _, found := kl.statusManager.GetPodStatus(kubecontainer.BuildPodFullName("pod2", "")); !found {
t.Fatalf("expected to have status cached for %q: %v", "pod2", err) t.Fatalf("expected to have status cached for pod2")
} }
// Sync with empty pods so that the entry in status map will be removed. // Sync with empty pods so that the entry in status map will be removed.
kl.SyncPods([]*api.Pod{}, emptyPodUIDs, map[string]*api.Pod{}, time.Now()) kl.SyncPods([]*api.Pod{}, emptyPodUIDs, map[string]*api.Pod{}, time.Now())
if _, err := kl.getPodStatus(kubecontainer.BuildPodFullName("pod2", "")); err == nil { if _, found := kl.statusManager.GetPodStatus(kubecontainer.BuildPodFullName("pod2", "")); found {
t.Fatalf("expected to not have status cached for %q: %v", "pod2", err) t.Fatalf("expected to not have status cached for pod2")
} }
} }
@@ -4169,11 +4139,11 @@ func TestGetPodStatusWithLastTermination(t *testing.T) {
t.Errorf("%d: unexpected error: %v", i, err) t.Errorf("%d: unexpected error: %v", i, err)
} }
// Check if we can retrieve the pod status from GetPodStatus(). // Check if we can retrieve the pod status.
podName := kubecontainer.GetPodFullName(pods[0]) podName := kubecontainer.GetPodFullName(pods[0])
status, err := kubelet.getPodStatus(podName) status, found := kubelet.statusManager.GetPodStatus(podName)
if err != nil { if !found {
t.Fatalf("unable to retrieve pod status for pod %q: %#v.", podName, err) t.Fatalf("unable to retrieve pod status for pod %q.", podName)
} else { } else {
terminatedContainers := []string{} terminatedContainers := []string{}
for _, cs := range status.ContainerStatuses { for _, cs := range status.ContainerStatuses {
@@ -4244,9 +4214,9 @@ func TestGetPodCreationFailureReason(t *testing.T) {
t.Errorf("unexpected error: %v", err) t.Errorf("unexpected error: %v", err)
} }
status, err := kubelet.getPodStatus(kubecontainer.GetPodFullName(pod)) status, found := kubelet.statusManager.GetPodStatus(kubecontainer.GetPodFullName(pod))
if err != nil { if !found {
t.Errorf("unexpected error %v", err) t.Fatalf("unexpected error %v", err)
} }
if len(status.ContainerStatuses) < 1 { if len(status.ContainerStatuses) < 1 {
t.Errorf("expected 1 container status, got %d", len(status.ContainerStatuses)) t.Errorf("expected 1 container status, got %d", len(status.ContainerStatuses))
@@ -4310,9 +4280,9 @@ func TestGetPodPullImageFailureReason(t *testing.T) {
t.Errorf("unexpected error: %v", err) t.Errorf("unexpected error: %v", err)
} }
status, err := kubelet.getPodStatus(kubecontainer.GetPodFullName(pod)) status, found := kubelet.statusManager.GetPodStatus(kubecontainer.GetPodFullName(pod))
if err != nil { if !found {
t.Errorf("unexpected error %v", err) t.Errorf("expected status of pod %q to be found", kubecontainer.GetPodFullName(pod))
} }
if len(status.ContainerStatuses) < 1 { if len(status.ContainerStatuses) < 1 {
t.Errorf("expected 1 container status, got %d", len(status.ContainerStatuses)) t.Errorf("expected 1 container status, got %d", len(status.ContainerStatuses))
@@ -4506,7 +4476,7 @@ func TestMakePortMappings(t *testing.T) {
} }
} }
func TestFilterOutPodsPastActiveDeadline(t *testing.T) { func TestIsPodPastActiveDeadline(t *testing.T) {
testKubelet := newTestKubelet(t) testKubelet := newTestKubelet(t)
kubelet := testKubelet.kubelet kubelet := testKubelet.kubelet
pods := newTestPods(5) pods := newTestPods(5)
@@ -4519,23 +4489,21 @@ func TestFilterOutPodsPastActiveDeadline(t *testing.T) {
pods[0].Spec.ActiveDeadlineSeconds = &exceededActiveDeadlineSeconds pods[0].Spec.ActiveDeadlineSeconds = &exceededActiveDeadlineSeconds
pods[1].Status.StartTime = &startTime pods[1].Status.StartTime = &startTime
pods[1].Spec.ActiveDeadlineSeconds = &notYetActiveDeadlineSeconds pods[1].Spec.ActiveDeadlineSeconds = &notYetActiveDeadlineSeconds
expected := []*api.Pod{pods[1], pods[2], pods[3], pods[4]} tests := []struct {
pod *api.Pod
expected bool
}{{pods[0], true}, {pods[1], false}, {pods[2], false}, {pods[3], false}, {pods[4], false}}
kubelet.podManager.SetPods(pods) kubelet.podManager.SetPods(pods)
actual := kubelet.filterOutPodsPastActiveDeadline(pods) for i, tt := range tests {
if !reflect.DeepEqual(expected, actual) { actual := kubelet.pastActiveDeadline(tt.pod)
expectedNames := "" if actual != tt.expected {
for _, pod := range expected { t.Errorf("[%d] expected %#v, got %#v", i, tt.expected, actual)
expectedNames = expectedNames + pod.Name + " "
} }
actualNames := ""
for _, pod := range actual {
actualNames = actualNames + pod.Name + " "
}
t.Errorf("expected %#v, got %#v", expectedNames, actualNames)
} }
} }
func TestSyncPodsDeletesPodsThatRunTooLong(t *testing.T) { func TestSyncPodsSetStatusToFailedForPodsThatRunTooLong(t *testing.T) {
testKubelet := newTestKubelet(t) testKubelet := newTestKubelet(t)
testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorApi.MachineInfo{}, nil) testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorApi.MachineInfo{}, nil)
kubelet := testKubelet.kubelet kubelet := testKubelet.kubelet
@@ -4593,27 +4561,22 @@ func TestSyncPodsDeletesPodsThatRunTooLong(t *testing.T) {
}, },
} }
// Let the pod worker set the status to failed after this sync.
err := kubelet.SyncPods(pods, emptyPodUIDs, map[string]*api.Pod{}, time.Now()) err := kubelet.SyncPods(pods, emptyPodUIDs, map[string]*api.Pod{}, time.Now())
if err != nil { if err != nil {
t.Errorf("unexpected error: %v", err) t.Errorf("unexpected error: %v", err)
} }
podFullName := kubecontainer.GetPodFullName(pods[0])
verifyCalls(t, fakeDocker, []string{"list", "inspect_container", "stop", "inspect_container", "stop", "list"}) status, found := kubelet.statusManager.GetPodStatus(podFullName)
if !found {
// A map iteration is used to delete containers, so must not depend on t.Errorf("expected to found status for pod %q", status)
// order here.
expectedToStop := map[string]bool{
"1234": true,
"9876": true,
} }
if len(fakeDocker.Stopped) != 2 || if status.Phase != api.PodFailed {
!expectedToStop[fakeDocker.Stopped[0]] || t.Fatalf("expected pod status %q, ot %q.", api.PodFailed, status.Phase)
!expectedToStop[fakeDocker.Stopped[1]] {
t.Errorf("Wrong containers were stopped: %v", fakeDocker.Stopped)
} }
} }
func TestSyncPodsDoesNotDeletePodsThatRunTooLong(t *testing.T) { func TestSyncPodsDoesNotSetPodsThatDidNotRunTooLongToFailed(t *testing.T) {
testKubelet := newTestKubelet(t) testKubelet := newTestKubelet(t)
testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorApi.MachineInfo{}, nil) testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorApi.MachineInfo{}, nil)
kubelet := testKubelet.kubelet kubelet := testKubelet.kubelet
@@ -4676,14 +4639,14 @@ func TestSyncPodsDoesNotDeletePodsThatRunTooLong(t *testing.T) {
if err != nil { if err != nil {
t.Errorf("unexpected error: %v", err) t.Errorf("unexpected error: %v", err)
} }
verifyCalls(t, fakeDocker, []string{ podFullName := kubecontainer.GetPodFullName(pods[0])
"list", "list", "list", status, found := kubelet.statusManager.GetPodStatus(podFullName)
// Get pod status. if !found {
"inspect_container", "inspect_container", t.Errorf("expected to found status for pod %q", status)
// Check the pod infra container. }
"inspect_container", if status.Phase == api.PodFailed {
// Get pod status. t.Fatalf("expected pod status to not be %q", status.Phase)
"list", "inspect_container", "inspect_container", "list"}) }
} }
func TestDeletePodDirsForDeletedPods(t *testing.T) { func TestDeletePodDirsForDeletedPods(t *testing.T) {

View File

@@ -81,6 +81,9 @@ func (s *statusManager) SetPodStatus(pod *api.Pod, status api.PodStatus) {
} }
// if the status has no start time, we need to set an initial time // if the status has no start time, we need to set an initial time
// TODO(yujuhong): Consider setting StartTime when generating the pod
// status instead, which would allow statusManager to become a simple cache
// again.
if status.StartTime.IsZero() { if status.StartTime.IsZero() {
if pod.Status.StartTime.IsZero() { if pod.Status.StartTime.IsZero() {
// the pod did not have a previously recorded value so set to now // the pod did not have a previously recorded value so set to now