Kubernetes Mesos integration
This commit includes the fundamental components of the Kubernetes Mesos integration:

* Kubernetes-Mesos scheduler
* Kubernetes-Mesos executor
* Supporting libs

Dependencies and upstream changes are included in a separate commit for easy review. After this initial upstream, two PRs will follow:

* km (hypercube) and k8sm-controller-manager #9265
* Static pods support #9077

Fixes applied:

- Precise metrics subsystem definitions
  - mesosphere/kubernetes-mesos#331
  - https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion_r31875232
  - https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion_r31875240
- Improve comments and add clarifications
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875208
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875226
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875227
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875228
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875239
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875243
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875234
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875256
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875255
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875251
- Clarify which Schedule function is actually called
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875246
contrib/mesos/pkg/executor/executor.go (new file, 846 lines added)
@@ -0,0 +1,846 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package executor

import (
	"encoding/json"
	"fmt"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
	"github.com/fsouza/go-dockerclient"
	"github.com/gogo/protobuf/proto"
	log "github.com/golang/glog"
	bindings "github.com/mesos/mesos-go/executor"
	mesos "github.com/mesos/mesos-go/mesosproto"
	mutil "github.com/mesos/mesos-go/mesosutil"
)

const (
	containerPollTime = 300 * time.Millisecond
	launchGracePeriod = 5 * time.Minute
)

type stateType int32

const (
	disconnectedState stateType = iota
	connectedState
	suicidalState
	terminalState
)

func (s *stateType) get() stateType {
	return stateType(atomic.LoadInt32((*int32)(s)))
}

func (s *stateType) transition(from, to stateType) bool {
	return atomic.CompareAndSwapInt32((*int32)(s), int32(from), int32(to))
}

func (s *stateType) transitionTo(to stateType, unless ...stateType) bool {
	if len(unless) == 0 {
		atomic.StoreInt32((*int32)(s), int32(to))
		return true
	}
	for {
		state := s.get()
		for _, x := range unless {
			if state == x {
				return false
			}
		}
		if s.transition(state, to) {
			return true
		}
	}
}

type kuberTask struct {
	mesosTaskInfo *mesos.TaskInfo
	podName       string
}

// func that attempts suicide
type jumper func(bindings.ExecutorDriver, <-chan struct{})

type suicideWatcher interface {
	Next(time.Duration, bindings.ExecutorDriver, jumper) suicideWatcher
	Reset(time.Duration) bool
	Stop() bool
}

type podStatusFunc func() (*api.PodStatus, error)

// KubernetesExecutor is a Mesos executor that runs pods
// in a minion machine.
type KubernetesExecutor struct {
	kl                  *kubelet.Kubelet   // the kubelet instance.
	updateChan          chan<- interface{} // to send pod config updates to the kubelet
	state               stateType
	tasks               map[string]*kuberTask
	pods                map[string]*api.Pod
	lock                sync.RWMutex
	sourcename          string
	client              *client.Client
	events              <-chan watch.Event
	done                chan struct{}                     // signals shutdown
	outgoing            chan func() (mesos.Status, error) // outgoing queue to the mesos driver
	dockerClient        dockertools.DockerInterface
	suicideWatch        suicideWatcher
	suicideTimeout      time.Duration
	shutdownAlert       func()          // invoked just prior to executor shutdown
	kubeletFinished     <-chan struct{} // signals that kubelet Run() died
	initialRegistration sync.Once
	exitFunc            func(int)
	podStatusFunc       func(*kubelet.Kubelet, *api.Pod) (*api.PodStatus, error)
}

type Config struct {
	Kubelet         *kubelet.Kubelet
	Updates         chan<- interface{} // to send pod config updates to the kubelet
	SourceName      string
	APIClient       *client.Client
	Watch           watch.Interface
	Docker          dockertools.DockerInterface
	ShutdownAlert   func()
	SuicideTimeout  time.Duration
	KubeletFinished <-chan struct{} // signals that kubelet Run() died
	ExitFunc        func(int)
	PodStatusFunc   func(*kubelet.Kubelet, *api.Pod) (*api.PodStatus, error)
}

func (k *KubernetesExecutor) isConnected() bool {
	return connectedState == (&k.state).get()
}

// New creates a new kubernetes executor.
func New(config Config) *KubernetesExecutor {
	k := &KubernetesExecutor{
		kl:              config.Kubelet,
		updateChan:      config.Updates,
		state:           disconnectedState,
		tasks:           make(map[string]*kuberTask),
		pods:            make(map[string]*api.Pod),
		sourcename:      config.SourceName,
		client:          config.APIClient,
		done:            make(chan struct{}),
		outgoing:        make(chan func() (mesos.Status, error), 1024),
		dockerClient:    config.Docker,
		suicideTimeout:  config.SuicideTimeout,
		kubeletFinished: config.KubeletFinished,
		suicideWatch:    &suicideTimer{},
		shutdownAlert:   config.ShutdownAlert,
		exitFunc:        config.ExitFunc,
		podStatusFunc:   config.PodStatusFunc,
	}
	//TODO(jdef) do something real with these events..
	if config.Watch != nil {
		events := config.Watch.ResultChan()
		if events != nil {
			go func() {
				for e := range events {
					// e ~= watch.Event { ADDED, *api.Event }
					log.V(1).Info(e)
				}
			}()
			k.events = events
		}
	}
	return k
}

func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
	k.killKubeletContainers()
	k.resetSuicideWatch(driver)
	go k.sendLoop()
	//TODO(jdef) monitor kubeletFinished and shutdown if it happens
}

func (k *KubernetesExecutor) Done() <-chan struct{} {
	return k.done
}

func (k *KubernetesExecutor) isDone() bool {
	select {
	case <-k.done:
		return true
	default:
		return false
	}
}

// Registered is called when the executor is successfully registered with the slave.
func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,
	executorInfo *mesos.ExecutorInfo, frameworkInfo *mesos.FrameworkInfo, slaveInfo *mesos.SlaveInfo) {
	if k.isDone() {
		return
	}
	log.Infof("Executor %v of framework %v registered with slave %v\n",
		executorInfo, frameworkInfo, slaveInfo)
	if !(&k.state).transition(disconnectedState, connectedState) {
		log.Errorf("failed to register/transition to a connected state")
	}
	k.initialRegistration.Do(k.onInitialRegistration)
}

// Reregistered is called when the executor is successfully re-registered with the slave.
// This can happen when the slave fails over.
func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
	if k.isDone() {
		return
	}
	log.Infof("Reregistered with slave %v\n", slaveInfo)
	if !(&k.state).transition(disconnectedState, connectedState) {
		log.Errorf("failed to reregister/transition to a connected state")
	}
	k.initialRegistration.Do(k.onInitialRegistration)
}

func (k *KubernetesExecutor) onInitialRegistration() {
	// emit an empty update to allow the mesos "source" to be marked as seen
	k.updateChan <- kubelet.PodUpdate{
		Pods:   []*api.Pod{},
		Op:     kubelet.SET,
		Source: k.sourcename,
	}
}

// Disconnected is called when the executor is disconnected from the slave.
func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
	if k.isDone() {
		return
	}
	log.Infof("Slave is disconnected\n")
	if !(&k.state).transition(connectedState, disconnectedState) {
		log.Errorf("failed to disconnect/transition to a disconnected state")
	}
}

// LaunchTask is called when the executor receives a request to launch a task.
// This happens when the k8sm scheduler has decided to schedule the pod
// (which corresponds to a Mesos Task) onto the node where this executor
// is running, but the binding is not recorded in the Kubernetes store yet.
// This function is invoked to tell the executor to record the binding in the
// Kubernetes store and start the pod via the Kubelet.
func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
	if k.isDone() {
		return
	}
	log.Infof("Launch task %v\n", taskInfo)

	if !k.isConnected() {
		log.Errorf("Ignore launch task because the executor is disconnected\n")
		k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
			messages.ExecutorUnregistered))
		return
	}

	obj, err := api.Codec.Decode(taskInfo.GetData())
	if err != nil {
		log.Errorf("failed to extract yaml data from the taskInfo.data %v", err)
		k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
			messages.UnmarshalTaskDataFailure))
		return
	}
	pod, ok := obj.(*api.Pod)
	if !ok {
		log.Errorf("expected *api.Pod instead of %T: %+v", pod, pod)
		k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
			messages.UnmarshalTaskDataFailure))
		return
	}

	k.lock.Lock()
	defer k.lock.Unlock()

	taskId := taskInfo.GetTaskId().GetValue()
	if _, found := k.tasks[taskId]; found {
		log.Errorf("task already launched\n")
		// Not to send back TASK_RUNNING here, because
		// may be duplicated messages or duplicated task id.
		return
	}
	// remember this task so that:
	// (a) we ignore future launches for it
	// (b) we have a record of it so that we can kill it if needed
	// (c) we're leaving podName == "" for now, indicates we don't need to delete containers
	k.tasks[taskId] = &kuberTask{
		mesosTaskInfo: taskInfo,
	}
	k.resetSuicideWatch(driver)

	go k.launchTask(driver, taskId, pod)
}

// TODO(jdef) add metrics for this?
type suicideTimer struct {
	timer *time.Timer
}

func (w *suicideTimer) Next(d time.Duration, driver bindings.ExecutorDriver, f jumper) suicideWatcher {
	return &suicideTimer{
		timer: time.AfterFunc(d, func() {
			log.Warningf("Suicide timeout (%v) expired", d)
			f(driver, nil)
		}),
	}
}

func (w *suicideTimer) Stop() (result bool) {
	if w != nil && w.timer != nil {
		log.Infoln("stopping suicide watch") //TODO(jdef) debug
		result = w.timer.Stop()
	}
	return
}

// return true if the timer was successfully reset
func (w *suicideTimer) Reset(d time.Duration) bool {
	if w != nil && w.timer != nil {
		log.Infoln("resetting suicide watch") //TODO(jdef) debug
		w.timer.Reset(d)
		return true
	}
	return false
}

// determine whether we need to start a suicide countdown. if so, then start
// a timer that, upon expiration, causes this executor to commit suicide.
// this implementation runs asynchronously. callers that wish to wait for the
// reset to complete may wait for the returned signal chan to close.
func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
	ch := make(chan struct{})
	go func() {
		defer close(ch)
		k.lock.Lock()
		defer k.lock.Unlock()

		if k.suicideTimeout < 1 {
			return
		}

		if k.suicideWatch != nil {
			if len(k.tasks) > 0 {
				k.suicideWatch.Stop()
				return
			}
			if k.suicideWatch.Reset(k.suicideTimeout) {
				// valid timer, reset was successful
				return
			}
		}

		//TODO(jdef) reduce verbosity here once we're convinced that suicide watch is working properly
		log.Infof("resetting suicide watch timer for %v", k.suicideTimeout)

		k.suicideWatch = k.suicideWatch.Next(k.suicideTimeout, driver, jumper(k.attemptSuicide))
	}()
	return ch
}

func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
	k.lock.Lock()
	defer k.lock.Unlock()

	// this attempt may have been queued and since been aborted
	select {
	case <-abort:
		//TODO(jdef) reduce verbosity once suicide watch is working properly
		log.Infof("aborting suicide attempt since watch was cancelled")
		return
	default: // continue
	}

	// fail-safe, will abort kamikaze attempts if there are tasks
	if len(k.tasks) > 0 {
		ids := []string{}
		for taskid := range k.tasks {
			ids = append(ids, taskid)
		}
		log.Errorf("suicide attempt failed, there are still running tasks: %v", ids)
		return
	}

	log.Infoln("Attempting suicide")
	if (&k.state).transitionTo(suicidalState, suicidalState, terminalState) {
		//TODO(jdef) let the scheduler know?
		//TODO(jdef) is suicide more graceful than slave-demanded shutdown?
		k.doShutdown(driver)
	}
}

// async continuation of LaunchTask
func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {

	//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/scheduler.go
	binding := &api.Binding{
		ObjectMeta: api.ObjectMeta{
			Namespace:   pod.Namespace,
			Name:        pod.Name,
			Annotations: make(map[string]string),
		},
		Target: api.ObjectReference{
			Kind: "Node",
			Name: pod.Annotations[meta.BindingHostKey],
		},
	}

	// forward the annotations that the scheduler wants to apply
	for k, v := range pod.Annotations {
		binding.Annotations[k] = v
	}

	deleteTask := func() {
		k.lock.Lock()
		defer k.lock.Unlock()
		delete(k.tasks, taskId)
		k.resetSuicideWatch(driver)
	}

	log.Infof("Binding '%v/%v' to '%v' with annotations %+v...", pod.Namespace, pod.Name, binding.Target.Name, binding.Annotations)
	ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
	// TODO(k8s): use Pods interface for binding once clusters are upgraded
	// return b.Pods(binding.Namespace).Bind(binding)
	err := k.client.Post().Namespace(api.NamespaceValue(ctx)).Resource("bindings").Body(binding).Do().Error()
	if err != nil {
		deleteTask()
		k.sendStatus(driver, newStatus(mutil.NewTaskID(taskId), mesos.TaskState_TASK_FAILED,
			messages.CreateBindingFailure))
		return
	}
	podFullName := container.GetPodFullName(pod)

	// allow a recently failed-over scheduler the chance to recover the task/pod binding:
	// it may have failed and recovered before the apiserver is able to report the updated
	// binding information. replays of this status event will signal to the scheduler that
	// the apiserver should be up-to-date.
	data, err := json.Marshal(api.PodStatusResult{
		ObjectMeta: api.ObjectMeta{
			Name:     podFullName,
			SelfLink: "/podstatusresult",
		},
	})
	if err != nil {
		deleteTask()
		log.Errorf("failed to marshal pod status result: %v", err)
		k.sendStatus(driver, newStatus(mutil.NewTaskID(taskId), mesos.TaskState_TASK_FAILED,
			err.Error()))
		return
	}

	k.lock.Lock()
	defer k.lock.Unlock()

	// Add the task.
	task, found := k.tasks[taskId]
	if !found {
		log.V(1).Infof("task %v not found, probably killed: aborting launch, reporting lost", taskId)
		k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
		return
	}

	//TODO(jdef) check for duplicate pod name, if found send TASK_ERROR

	// from here on, we need to delete containers associated with the task
	// upon it going into a terminal state
	task.podName = podFullName
	k.pods[podFullName] = pod

	// send the latest snapshot of the set of pods to the kubelet via the pod update channel
	update := kubelet.PodUpdate{Op: kubelet.SET}
	for _, p := range k.pods {
		update.Pods = append(update.Pods, p)
	}
	k.updateChan <- update

	statusUpdate := &mesos.TaskStatus{
		TaskId:  mutil.NewTaskID(taskId),
		State:   mesos.TaskState_TASK_STARTING.Enum(),
		Message: proto.String(messages.CreateBindingSuccess),
		Data:    data,
	}
	k.sendStatus(driver, statusUpdate)

	// Delay reporting 'task running' until container is up.
	psf := podStatusFunc(func() (*api.PodStatus, error) {
		return k.podStatusFunc(k.kl, pod)
	})

	go k._launchTask(driver, taskId, podFullName, psf)
}

func (k *KubernetesExecutor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {

	expired := make(chan struct{})
	time.AfterFunc(launchGracePeriod, func() { close(expired) })

	getMarshalledInfo := func() (data []byte, cancel bool) {
		// potentially long call..
		if podStatus, err := psf(); err == nil && podStatus != nil {
			select {
			case <-expired:
				cancel = true
			default:
				k.lock.Lock()
				defer k.lock.Unlock()
				if _, found := k.tasks[taskId]; !found {
					// don't bother with the pod status if the task is already gone
					cancel = true
					break
				} else if podStatus.Phase != api.PodRunning {
					// avoid sending back a running status before it's really running
					break
				}
				log.V(2).Infof("Found pod status: '%v'", podStatus)
				result := api.PodStatusResult{
					ObjectMeta: api.ObjectMeta{
						Name:     podFullName,
						SelfLink: "/podstatusresult",
					},
					Status: *podStatus,
				}
				if data, err = json.Marshal(result); err != nil {
					log.Errorf("failed to marshal pod status result: %v", err)
				}
			}
		}
		return
	}

waitForRunningPod:
	for {
		select {
		case <-expired:
			log.Warningf("Launch expired grace period of '%v'", launchGracePeriod)
			break waitForRunningPod
		case <-time.After(containerPollTime):
			if data, cancel := getMarshalledInfo(); cancel {
				break waitForRunningPod
			} else if data == nil {
				continue waitForRunningPod
			} else {
				k.lock.Lock()
				defer k.lock.Unlock()
				if _, found := k.tasks[taskId]; !found {
					goto reportLost
				}

				statusUpdate := &mesos.TaskStatus{
					TaskId:  mutil.NewTaskID(taskId),
					State:   mesos.TaskState_TASK_RUNNING.Enum(),
					Message: proto.String(fmt.Sprintf("pod-running:%s", podFullName)),
					Data:    data,
				}

				k.sendStatus(driver, statusUpdate)

				// continue to monitor the health of the pod
				go k.__launchTask(driver, taskId, podFullName, psf)
				return
			}
		}
	}

	k.lock.Lock()
	defer k.lock.Unlock()
reportLost:
	k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
}

func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
	// TODO(nnielsen): Monitor health of pod and report if lost.
	// Should we also allow this to fail a couple of times before reporting lost?
	// What if the docker daemon is restarting and we can't connect, but it's
	// going to bring the pods back online as soon as it restarts?
	knownPod := func() bool {
		_, err := psf()
		return err == nil
	}
	// Wait for the pod to go away and stop monitoring once it does
	// TODO (jdefelice) replace with an /events watch?
	for {
		time.Sleep(containerPollTime)
		if k.checkForLostPodTask(driver, taskId, knownPod) {
			return
		}
	}
}

// Intended to be executed as part of the pod monitoring loop, this fn (ultimately) checks with Docker
// whether the pod is running. It will only return false if the task is still registered and the pod is
// registered in Docker. Otherwise it returns true. If there's still a task record on file, but no pod
// in Docker, then we'll also send a TASK_LOST event.
func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
	// TODO (jdefelice) don't send false alarms for deleted pods (KILLED tasks)
	k.lock.Lock()
	defer k.lock.Unlock()

	// TODO(jdef) we should really consider k.pods here, along with what docker is reporting, since the
	// kubelet may constantly attempt to instantiate a pod as long as it's in the pod state that we're
	// handing to it. otherwise, we're probably reporting a TASK_LOST prematurely. Should probably
	// consult RestartPolicy to determine appropriate behavior. Should probably also gracefully handle
	// docker daemon restarts.
	if _, ok := k.tasks[taskId]; ok {
		if isKnownPod() {
			return false
		} else {
			log.Warningf("Detected lost pod, reporting lost task %v", taskId)
			k.reportLostTask(driver, taskId, messages.ContainersDisappeared)
		}
	} else {
		log.V(2).Infof("Task %v no longer registered, stop monitoring for lost pods", taskId)
	}
	return true
}

// KillTask is called when the executor receives a request to kill a task.
func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
	if k.isDone() {
		return
	}
	log.Infof("Kill task %v\n", taskId)

	if !k.isConnected() {
		//TODO(jdefelice) send TASK_LOST here?
		log.Warningf("Ignore kill task because the executor is disconnected\n")
		return
	}

	k.lock.Lock()
	defer k.lock.Unlock()
	k.removePodTask(driver, taskId.GetValue(), messages.TaskKilled, mesos.TaskState_TASK_KILLED)
}

// Reports a lost task to the slave and updates internal task and pod tracking state.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
	k.removePodTask(driver, tid, reason, mesos.TaskState_TASK_LOST)
}

// deletes the pod and task associated with the task identified by tid and sends a task
// status update to mesos. also attempts to reset the suicide watch.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
	task, ok := k.tasks[tid]
	if !ok {
		log.V(1).Infof("Failed to remove task, unknown task %v\n", tid)
		return
	}
	delete(k.tasks, tid)
	k.resetSuicideWatch(driver)

	pid := task.podName
	if _, found := k.pods[pid]; !found {
		log.Warningf("Cannot remove unknown pod %v for task %v", pid, tid)
	} else {
		log.V(2).Infof("deleting pod %v for task %v", pid, tid)
		delete(k.pods, pid)

		// Send the pod updates to the channel.
		update := kubelet.PodUpdate{Op: kubelet.SET}
		for _, p := range k.pods {
			update.Pods = append(update.Pods, p)
		}
		k.updateChan <- update
	}
	// TODO(jdef): ensure that the update propagates, perhaps return a signal chan?
	k.sendStatus(driver, newStatus(mutil.NewTaskID(tid), state, reason))
}

// FrameworkMessage is called when the framework sends some message to the executor
func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
	if k.isDone() {
		return
	}
	if !k.isConnected() {
		log.Warningf("Ignore framework message because the executor is disconnected\n")
		return
	}

	log.Infof("Receives message from framework %v\n", message)
	//TODO(jdef) master reported a lost task, reconcile this! @see scheduler.go:handleTaskLost
	if strings.HasPrefix(message, "task-lost:") && len(message) > 10 {
		taskId := message[10:]
		if taskId != "" {
			// clean up pod state
			k.lock.Lock()
			defer k.lock.Unlock()
			k.reportLostTask(driver, taskId, messages.TaskLostAck)
		}
	}

	switch message {
	case messages.Kamikaze:
		k.attemptSuicide(driver, nil)
	}
}

// Shutdown is called when the executor receives a shutdown request.
func (k *KubernetesExecutor) Shutdown(driver bindings.ExecutorDriver) {
	k.lock.Lock()
	defer k.lock.Unlock()
	k.doShutdown(driver)
}

// assumes that caller has obtained state lock
func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
	defer func() {
		log.Errorf("exiting with unclean shutdown: %v", recover())
		if k.exitFunc != nil {
			k.exitFunc(1)
		}
	}()

	(&k.state).transitionTo(terminalState)

	// signal to all listeners that this KubeletExecutor is done!
	close(k.done)

	if k.shutdownAlert != nil {
		func() {
			util.HandleCrash()
			k.shutdownAlert()
		}()
	}

	log.Infoln("Stopping executor driver")
	_, err := driver.Stop()
	if err != nil {
		log.Warningf("failed to stop executor driver: %v", err)
	}

	log.Infoln("Shutdown the executor")

	// according to docs, mesos will generate TASK_LOST updates for us
	// if needed, so don't take extra time to do that here.
	k.tasks = map[string]*kuberTask{}

	select {
	// the main Run() func may still be running... wait for it to finish: it will
	// clear the pod configuration cleanly, telling k8s "there are no pods" and
	// clean up resources (pods, volumes, etc).
	case <-k.kubeletFinished:

	//TODO(jdef) attempt to wait for events to propagate to API server?

	// TODO(jdef) extract constant, should be smaller than whatever the
	// slave graceful shutdown timeout period is.
	case <-time.After(15 * time.Second):
		log.Errorf("timed out waiting for kubelet Run() to die")
	}

	log.Infoln("exiting")
	if k.exitFunc != nil {
		k.exitFunc(0)
	}
}

// Destroy existing k8s containers
func (k *KubernetesExecutor) killKubeletContainers() {
	if containers, err := dockertools.GetKubeletDockerContainers(k.dockerClient, true); err == nil {
		opts := docker.RemoveContainerOptions{
			RemoveVolumes: true,
			Force:         true,
		}
		for _, container := range containers {
			opts.ID = container.ID
			log.V(2).Infof("Removing container: %v", opts.ID)
			if err := k.dockerClient.RemoveContainer(opts); err != nil {
				log.Warning(err)
			}
		}
	} else {
		log.Warningf("Failed to list kubelet docker containers: %v", err)
	}
}

// Error is called when some error happens.
func (k *KubernetesExecutor) Error(driver bindings.ExecutorDriver, message string) {
	log.Errorln(message)
}

func newStatus(taskId *mesos.TaskID, state mesos.TaskState, message string) *mesos.TaskStatus {
	return &mesos.TaskStatus{
		TaskId:  taskId,
		State:   &state,
		Message: proto.String(message),
	}
}

func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
	select {
	case <-k.done:
	default:
		k.outgoing <- func() (mesos.Status, error) { return driver.SendStatusUpdate(status) }
	}
}

func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
	select {
	case <-k.done:
	default:
		k.outgoing <- func() (mesos.Status, error) { return driver.SendFrameworkMessage(msg) }
	}
}

func (k *KubernetesExecutor) sendLoop() {
	defer log.V(1).Info("sender loop exiting")
	for {
		select {
		case <-k.done:
			return
		default:
			if !k.isConnected() {
				select {
				case <-k.done:
				case <-time.After(1 * time.Second):
				}
				continue
			}
			sender, ok := <-k.outgoing
			if !ok {
				// programming error
				panic("someone closed the outgoing channel")
			}
			if status, err := sender(); err == nil {
				continue
			} else {
				log.Error(err)
				if status == mesos.Status_DRIVER_ABORTED {
					return
				}
			}
			// attempt to re-queue the sender
			select {
			case <-k.done:
			case k.outgoing <- sender:
			}
		}
	}
}