Merge pull request #15518 from ravilr/mesos_graceful_termination_tasklost
Auto commit by PR queue bot
@@ -437,16 +437,18 @@ func (k *KubernetesScheduler) reconcileTerminalTask(driver bindings.SchedulerDri
 	if (state == podtask.StateRunning || state == podtask.StatePending) &&
 		((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
 			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
-			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED)) {
+			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
+			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared)) {
 		//--
 		// pod-task has metadata that refers to:
 		// (1) a task that Mesos no longer knows about, or else
 		// (2) a pod that the Kubelet will never report as "failed"
+		// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
 		// For now, destroy the pod and hope that there's a replication controller backing it up.
 		// TODO(jdef) for case #2 don't delete the pod, just update it's status to Failed
 		pod := &task.Pod
 		log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
-		if err := k.client.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil && !errors.IsNotFound(err) {
+		if err := k.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
 			log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
 		}
 	} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
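Note on the change (reviewer-style sketch, not part of the patch): the new disjunct lets a status whose source is the executor and whose message is messages.ContainersDisappeared (per the added comment, a task the kubeletExecutor reported as lost, likely after a docker daemon crash/restart) take the same "delete the rogue pod" path as the existing master/slave reconciliation cases. Below is a minimal Go sketch that restates the combined condition as a standalone predicate; the name isLostTask, the parameter types, and the import paths are assumptions for illustration only, since the original code tests this inline inside reconcileTerminalTask.

// Sketch only: not part of the patch. Restates the lost-task condition
// from the diff above as a named predicate. Function name, parameter
// types, and import paths are assumptions.
package scheduler

import (
	mesos "github.com/mesos/mesos-go/mesosproto"

	"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
)

// isLostTask reports whether a pod-task the scheduler still considers
// pending or running is, according to the incoming TaskStatus, actually gone.
func isLostTask(state podtask.StateType, status *mesos.TaskStatus) bool {
	if state != podtask.StateRunning && state != podtask.StatePending {
		return false
	}
	src, reason := status.GetSource(), status.GetReason()
	switch {
	case src == mesos.TaskStatus_SOURCE_MASTER && reason == mesos.TaskStatus_REASON_RECONCILIATION:
		return true // master reconciliation: Mesos no longer knows the task
	case src == mesos.TaskStatus_SOURCE_SLAVE && reason == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED:
		return true // slave reports the executor terminated
	case src == mesos.TaskStatus_SOURCE_SLAVE && reason == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED:
		return true // slave reports the executor unregistered
	case src == mesos.TaskStatus_SOURCE_EXECUTOR && status.GetMessage() == messages.ContainersDisappeared:
		return true // new case: executor reports its containers disappeared
	default:
		return false
	}
}

The second change swaps the nil delete options for api.NewDeleteOptions(0), which builds DeleteOptions with a zero-second grace period, so the rogue pod is removed immediately rather than waiting out the default graceful-termination window; presumably there is nothing left to terminate gracefully once its containers have disappeared.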