Merge pull request #16316 from mesosphere/scheduler-refactor

MESOS: Refactor scheduler
Dr. Stefan Schimanski
2015-11-12 15:28:25 +01:00
52 changed files with 3254 additions and 2591 deletions

Binary file not shown.

View File

@@ -99,7 +99,7 @@ type NodeInfo struct {
// KubernetesExecutor is a Mesos executor that runs pods
// in a minion machine.
type KubernetesExecutor struct {
type Executor struct {
updateChan chan<- kubetypes.PodUpdate // sent to the kubelet, closed on shutdown
state stateType
tasks map[string]*kuberTask
@@ -136,13 +136,13 @@ type Config struct {
NodeInfos chan<- NodeInfo
}
func (k *KubernetesExecutor) isConnected() bool {
func (k *Executor) isConnected() bool {
return connectedState == (&k.state).get()
}
// New creates a new kubernetes executor.
func New(config Config) *KubernetesExecutor {
k := &KubernetesExecutor{
func New(config Config) *Executor {
k := &Executor{
updateChan: config.Updates,
state: disconnectedState,
tasks: make(map[string]*kuberTask),
@@ -187,7 +187,7 @@ func New(config Config) *KubernetesExecutor {
return k
}
func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
func (k *Executor) Init(driver bindings.ExecutorDriver) {
k.killKubeletContainers()
k.resetSuicideWatch(driver)
@@ -196,7 +196,7 @@ func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
//TODO(jdef) monitor kubeletFinished and shutdown if it happens
}
func (k *KubernetesExecutor) isDone() bool {
func (k *Executor) isDone() bool {
select {
case <-k.terminate:
return true
@@ -206,7 +206,7 @@ func (k *KubernetesExecutor) isDone() bool {
}
// sendPodUpdate assumes that the caller is holding the state lock; returns true when the update is sent, otherwise false
func (k *KubernetesExecutor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
func (k *Executor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
if k.isDone() {
return false
}
@@ -215,7 +215,7 @@ func (k *KubernetesExecutor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
}
// Registered is called when the executor is successfully registered with the slave.
func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,
func (k *Executor) Registered(driver bindings.ExecutorDriver,
executorInfo *mesos.ExecutorInfo, frameworkInfo *mesos.FrameworkInfo, slaveInfo *mesos.SlaveInfo) {
if k.isDone() {
return
@@ -252,7 +252,7 @@ func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,
// Reregistered is called when the executor is successfully re-registered with the slave.
// This can happen when the slave fails over.
func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
func (k *Executor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
if k.isDone() {
return
}
@@ -280,7 +280,7 @@ func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveI
}
// initializeStaticPodsSource unzips the data slice into the static-pods directory
func (k *KubernetesExecutor) initializeStaticPodsSource(data []byte) {
func (k *Executor) initializeStaticPodsSource(data []byte) {
log.V(2).Infof("extracting static pods config to %s", k.staticPodsConfigPath)
err := archive.UnzipDir(data, k.staticPodsConfigPath)
if err != nil {
@@ -290,7 +290,7 @@ func (k *KubernetesExecutor) initializeStaticPodsSource(data []byte) {
}
// Disconnected is called when the executor is disconnected from the slave.
func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
func (k *Executor) Disconnected(driver bindings.ExecutorDriver) {
if k.isDone() {
return
}
@@ -306,7 +306,7 @@ func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
// is running, but the binding is not recorded in the Kubernetes store yet.
// This function is invoked to tell the executor to record the binding in the
// Kubernetes store and start the pod via the Kubelet.
func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
func (k *Executor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
if k.isDone() {
return
}
@@ -356,7 +356,7 @@ func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo
go k.launchTask(driver, taskId, pod)
}
func (k *KubernetesExecutor) handleChangedApiserverPod(pod *api.Pod) {
func (k *Executor) handleChangedApiserverPod(pod *api.Pod) {
// exclude "pre-scheduled" pods which have a NodeName set to this node without being scheduled already
taskId := pod.Annotations[meta.TaskIdKey]
if taskId == "" {
@@ -402,7 +402,7 @@ func (k *KubernetesExecutor) handleChangedApiserverPod(pod *api.Pod) {
// a timer that, upon expiration, causes this executor to commit suicide.
// this implementation runs asynchronously. callers that wish to wait for the
// reset to complete may wait for the returned signal chan to close.
func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
func (k *Executor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
ch := make(chan struct{})
go func() {
defer close(ch)
@@ -432,7 +432,7 @@ func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <
return ch
}
func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
func (k *Executor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
k.lock.Lock()
defer k.lock.Unlock()
@@ -464,7 +464,7 @@ func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abor
}
// async continuation of LaunchTask
func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {
func (k *Executor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {
deleteTask := func() {
k.lock.Lock()
defer k.lock.Unlock()
@@ -475,7 +475,7 @@ func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId s
// TODO(k8s): use Pods interface for binding once clusters are upgraded
// return b.Pods(binding.Namespace).Bind(binding)
if pod.Spec.NodeName == "" {
//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/scheduler.go
//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/framework.go
binding := &api.Binding{
ObjectMeta: api.ObjectMeta{
Namespace: pod.Namespace,
@@ -588,7 +588,7 @@ func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId s
go k._launchTask(driver, taskId, podFullName, psf)
}
func (k *KubernetesExecutor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
func (k *Executor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
expired := make(chan struct{})
@@ -669,7 +669,7 @@ reportLost:
k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
}
func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
func (k *Executor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
// TODO(nnielsen): Monitor health of pod and report if lost.
// Should we also allow this to fail a couple of times before reporting lost?
// What if the docker daemon is restarting and we can't connect, but it's
@@ -692,7 +692,7 @@ func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId
// whether the pod is running. It will only return false if the task is still registered and the pod is
// registered in Docker. Otherwise it returns true. If there's still a task record on file, but no pod
// in Docker, then we'll also send a TASK_LOST event.
func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
func (k *Executor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
// TODO (jdefelice) don't send false alarms for deleted pods (KILLED tasks)
k.lock.Lock()
defer k.lock.Unlock()
@@ -716,7 +716,7 @@ func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver,
}
// KillTask is called when the executor receives a request to kill a task.
func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
func (k *Executor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
if k.isDone() {
return
}
@@ -735,14 +735,14 @@ func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *me
// Reports a lost task to the slave and updates internal task and pod tracking state.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
func (k *Executor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
k.removePodTask(driver, tid, reason, mesos.TaskState_TASK_LOST)
}
// deletes the pod and task associated with the task identified by tid and sends a task
// status update to mesos. also attempts to reset the suicide watch.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
func (k *Executor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
task, ok := k.tasks[tid]
if !ok {
log.V(1).Infof("Failed to remove task, unknown task %v\n", tid)
@@ -770,7 +770,7 @@ func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid,
}
// FrameworkMessage is called when the framework sends some message to the executor
func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
func (k *Executor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
if k.isDone() {
return
}
@@ -780,7 +780,7 @@ func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, me
}
log.Infof("Receives message from framework %v\n", message)
//TODO(jdef) master reported a lost task, reconcile this! @see scheduler.go:handleTaskLost
//TODO(jdef) master reported a lost task, reconcile this! @see framework.go:handleTaskLost
if strings.HasPrefix(message, messages.TaskLost+":") {
taskId := message[len(messages.TaskLost)+1:]
if taskId != "" {
@@ -798,14 +798,14 @@ func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, me
}
// Shutdown is called when the executor receives a shutdown request.
func (k *KubernetesExecutor) Shutdown(driver bindings.ExecutorDriver) {
func (k *Executor) Shutdown(driver bindings.ExecutorDriver) {
k.lock.Lock()
defer k.lock.Unlock()
k.doShutdown(driver)
}
// assumes that caller has obtained state lock
func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
func (k *Executor) doShutdown(driver bindings.ExecutorDriver) {
defer func() {
log.Errorf("exiting with unclean shutdown: %v", recover())
if k.exitFunc != nil {
@@ -859,7 +859,7 @@ func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
}
// Destroy existing k8s containers
func (k *KubernetesExecutor) killKubeletContainers() {
func (k *Executor) killKubeletContainers() {
if containers, err := dockertools.GetKubeletDockerContainers(k.dockerClient, true); err == nil {
opts := docker.RemoveContainerOptions{
RemoveVolumes: true,
@@ -878,7 +878,7 @@ func (k *KubernetesExecutor) killKubeletContainers() {
}
// Error is called when some error happens.
func (k *KubernetesExecutor) Error(driver bindings.ExecutorDriver, message string) {
func (k *Executor) Error(driver bindings.ExecutorDriver, message string) {
log.Errorln(message)
}
@@ -890,7 +890,7 @@ func newStatus(taskId *mesos.TaskID, state mesos.TaskState, message string) *mes
}
}
func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
func (k *Executor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
select {
case <-k.terminate:
default:
@@ -898,7 +898,7 @@ func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *
}
}
func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
func (k *Executor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
select {
case <-k.terminate:
default:
@@ -906,7 +906,7 @@ func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver
}
}
func (k *KubernetesExecutor) sendLoop() {
func (k *Executor) sendLoop() {
defer log.V(1).Info("sender loop exiting")
for {
select {
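The isDone/sendStatus/sendLoop plumbing above all hangs off a single terminate channel that is closed exactly once at shutdown; a receive from a closed channel always succeeds immediately, which makes the non-blocking done-check cheap. A minimal, self-contained sketch of the idiom, with illustrative names rather than the executor's real fields:

package main

import "fmt"

type executor struct {
	terminate chan struct{} // closed exactly once on shutdown
}

func (e *executor) isDone() bool {
	select {
	case <-e.terminate:
		return true // a receive on a closed channel never blocks
	default:
		return false
	}
}

func (e *executor) shutdown() {
	close(e.terminate) // wakes every current and future receiver
}

func main() {
	e := &executor{terminate: make(chan struct{})}
	fmt.Println(e.isDone()) // false
	e.shutdown()
	fmt.Println(e.isDone()) // true
}

Closing a channel is the one channel operation that broadcasts to every current and future receiver, which is why it is preferred here over sending a value.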

View File

@@ -170,11 +170,10 @@ func TestExecutorLaunchAndKillTask(t *testing.T) {
}
pod := NewTestPod(1)
podTask, err := podtask.New(api.NewDefaultContext(), "",
*pod, &mesosproto.ExecutorInfo{})
podTask, err := podtask.New(api.NewDefaultContext(), "", pod)
assert.Equal(t, nil, err, "must be able to create a task from a pod")
taskInfo := podTask.BuildTaskInfo()
taskInfo := podTask.BuildTaskInfo(&mesosproto.ExecutorInfo{})
data, err := testapi.Default.Codec().Encode(pod)
assert.Equal(t, nil, err, "must be able to encode a pod's spec data")
taskInfo.Data = data
@@ -417,10 +416,8 @@ func TestExecutorFrameworkMessage(t *testing.T) {
// set up a pod to then lose
pod := NewTestPod(1)
podTask, _ := podtask.New(api.NewDefaultContext(), "foo",
*pod, &mesosproto.ExecutorInfo{})
taskInfo := podTask.BuildTaskInfo()
podTask, _ := podtask.New(api.NewDefaultContext(), "foo", pod)
taskInfo := podTask.BuildTaskInfo(&mesosproto.ExecutorInfo{})
data, _ := testapi.Default.Codec().Encode(pod)
taskInfo.Data = data

View File

@@ -66,7 +66,7 @@ func (m *MockExecutorDriver) SendFrameworkMessage(msg string) (mesosproto.Status
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func NewTestKubernetesExecutor() (*KubernetesExecutor, chan kubetypes.PodUpdate) {
func NewTestKubernetesExecutor() (*Executor, chan kubetypes.PodUpdate) {
updates := make(chan kubetypes.PodUpdate, 1024)
return New(Config{
Docker: dockertools.ConnectToDockerOrDie("fake://"),

View File

@@ -219,7 +219,7 @@ func (ms *MinionServer) launchHyperkubeServer(server string, args []string, logF
}
pwd, err := os.Getwd()
if err != nil {
log.Fatalf("Cannot get current directory: %v", err)
panic(fmt.Errorf("Cannot get current directory: %v", err))
}
kmEnv = append(kmEnv, fmt.Sprintf("%s:%s", e, path.Join(pwd, "bin")))
}
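For context on the log.Fatalf to panic change above: log.Fatalf terminates the process via os.Exit, so deferred cleanup never runs, whereas a panic unwinds the stack, runs defers, and can still be recovered higher up, which the minion server's supervision presumably wants. A minimal illustration, not the minion server's actual code:

package main

import "fmt"

func main() {
	defer fmt.Println("cleanup runs") // executed during panic unwinding, skipped by os.Exit
	panic(fmt.Errorf("cannot get current directory: %v", "example error"))
}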

View File

@@ -0,0 +1,167 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package algorithm
import (
"fmt"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
)
type SchedulerAlgorithm interface {
Schedule(pod *api.Pod) (string, error)
}
// schedulerAlgorithm implements the SchedulerAlgorithm interface
type schedulerAlgorithm struct {
sched scheduler.Scheduler
podUpdates queue.FIFO
podScheduler podschedulers.PodScheduler
}
func New(sched scheduler.Scheduler, podUpdates queue.FIFO, podScheduler podschedulers.PodScheduler) SchedulerAlgorithm {
return &schedulerAlgorithm{
sched: sched,
podUpdates: podUpdates,
podScheduler: podScheduler,
}
}
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selected machine's name and an error (if there is any).
func (k *schedulerAlgorithm) Schedule(pod *api.Pod) (string, error) {
log.Infof("Try to schedule pod %v\n", pod.Name)
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
// default upstream scheduler passes pod.Name as binding.PodID
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return "", err
}
k.sched.Lock()
defer k.sched.Unlock()
switch task, state := k.sched.Tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// There's a bit of a potential race here: a pod could have been yielded() and
// then deleted before we get *here*.
// We use meta to index the pod in the store since that's what k8s reflector does.
podName, err := cache.MetaNamespaceKeyFunc(pod)
if err != nil {
log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
return "", errors.NoSuchPodErr
}
if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
log.Infof("aborting Schedule, pod has been deleted %+v", pod)
return "", errors.NoSuchPodErr
}
podTask, err := podtask.New(ctx, "", pod)
if err != nil {
log.Warningf("aborting Schedule, unable to create podtask object %+v: %v", pod, err)
return "", err
}
podTask, err = k.sched.Tasks().Register(podTask)
if err != nil {
return "", err
}
return k.doSchedule(podTask)
//TODO(jdef) it's possible that the pod state has diverged from what
//we knew previously, we should probably update the task.Pod state here
//before proceeding with scheduling
case podtask.StatePending:
if pod.UID != task.Pod.UID {
// we're dealing with a brand new pod spec here, so the old one must have been
// deleted -- and so our task store is out of sync w/ respect to reality
//TODO(jdef) reconcile task
return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
} else if task.Has(podtask.Launched) {
// task has been marked as "launched" but the pod binding creation may have failed in k8s;
// we're going to let someone else handle it, probably the mesos task error handler
return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
} else {
return k.doSchedule(task)
}
default:
return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
}
}
// Call SchedulePod and procure resources, returning the name of the machine the task is scheduled on
func (k *schedulerAlgorithm) doSchedule(task *podtask.T) (string, error) {
var offer offers.Perishable
var err error
if task.HasAcceptedOffer() {
// verify that the offer is still on the table
var ok bool
offer, ok = k.sched.Offers().Get(task.GetOfferId())
if !ok || offer.HasExpired() {
task.Offer.Release()
task.Reset()
if err = k.sched.Tasks().Update(task); err != nil {
return "", err
}
}
}
if offer == nil {
offer, err = k.podScheduler.SchedulePod(k.sched.Offers(), task)
}
if err != nil {
return "", err
}
details := offer.Details()
if details == nil {
return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
}
if task.Offer != nil && task.Offer != offer {
return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
}
task.Offer = offer
if err := k.podScheduler.Procurement()(task, details); err != nil {
offer.Release()
task.Reset()
return "", err
}
if err := k.sched.Tasks().Update(task); err != nil {
offer.Release()
return "", err
}
return details.GetHostname(), nil
}
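To make the SchedulerAlgorithm contract above concrete: Schedule maps a single pod to a node name, or fails. A minimal, self-contained sketch of that shape; the Pod type and the round-robin policy are stand-ins, not the real k8s API objects or this PR's algorithm:

package main

import (
	"errors"
	"fmt"
)

type Pod struct{ Name string }

type SchedulerAlgorithm interface {
	Schedule(pod *Pod) (string, error)
}

// roundRobin assigns pods to a fixed node list in turn.
type roundRobin struct {
	nodes []string
	next  int
}

func (r *roundRobin) Schedule(pod *Pod) (string, error) {
	if len(r.nodes) == 0 {
		return "", errors.New("no nodes available")
	}
	node := r.nodes[r.next%len(r.nodes)]
	r.next++
	return node, nil
}

func main() {
	var alg SchedulerAlgorithm = &roundRobin{nodes: []string{"node-a", "node-b"}}
	host, _ := alg.Schedule(&Pod{Name: "web-1"})
	fmt.Println(host) // node-a
}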

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package algorithm implements the SchedulerAlgorithm
package algorithm

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package podschedulers defines an interface (w/ implementations) for matching
// pods against offers.
package podschedulers

View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package podschedulers
import (
"fmt"
@@ -23,6 +23,7 @@ import (
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
)
@@ -62,7 +63,7 @@ func NewFCFSPodScheduler(as AllocationStrategy, lookupNode node.LookupFunc) PodS
}
// A first-come-first-served scheduler: acquires the first offer that can support the task
func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, error) {
podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name)
var acceptedOffer offers.Perishable
err := r.Walk(func(p offers.Perishable) (bool, error) {
@@ -101,5 +102,5 @@ func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, unused SlaveIndex, t
return nil, err
}
log.V(2).Infof("failed to find a fit for pod: %s", podName)
return nil, noSuitableOffersErr
return nil, errors.NoSuitableOffersErr
}
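The offer walk in SchedulePod above reduces to a first-fit search: take the first offer whose resources cover the task. A rough, self-contained sketch under simplified types; the real code walks an offers.Registry of perishable offers rather than a plain slice:

package main

import (
	"errors"
	"fmt"
)

type Offer struct {
	Host string
	CPU  float64
	Mem  float64 // MB
}

var errNoSuitableOffers = errors.New("No suitable offers for pod/task")

// scheduleFCFS returns the first offer satisfying the task's resource needs.
func scheduleFCFS(offers []Offer, cpu, mem float64) (*Offer, error) {
	for i := range offers {
		if offers[i].CPU >= cpu && offers[i].Mem >= mem {
			return &offers[i], nil // first fit wins
		}
	}
	return nil, errNoSuitableOffers
}

func main() {
	available := []Offer{{"slave-1", 0.5, 128}, {"slave-2", 2, 1024}}
	if o, err := scheduleFCFS(available, 1, 512); err == nil {
		fmt.Println(o.Host) // slave-2
	}
}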

View File

@@ -14,11 +14,9 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package podschedulers
import (
"errors"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
)
@@ -37,25 +35,11 @@ type PodScheduler interface {
// SchedulePod implements how to schedule pods among slaves.
// We can have different implementations for different scheduling policies.
//
// The function accepts a group of slaves (each contains offers from
// that slave) and a single pod, which aligns well with the k8s scheduling
// algorithm. It returns an offerId that is acceptable for the pod, otherwise
// nil. The caller is responsible for filling in task state w/ relevant offer
// details.
// The function accepts a set of offers and a single pod, which aligns well
// with the k8s scheduling algorithm. It returns an offerId that is acceptable
// for the pod, otherwise nil. The caller is responsible for filling in task
// state w/ relevant offer details.
//
// See the FCFSPodScheduler for example.
SchedulePod(r offers.Registry, slaves SlaveIndex, task *podtask.T) (offers.Perishable, error)
}
// A minimal placeholder
type empty struct{}
var (
noSuitableOffersErr = errors.New("No suitable offers for pod/task")
noSuchPodErr = errors.New("No such pod exists")
noSuchTaskErr = errors.New("No such task exists")
)
type SlaveIndex interface {
slaveHostNameFor(id string) string
SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, error)
}

View File

@@ -0,0 +1,157 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package binder
import (
"fmt"
"strconv"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
)
type Binder interface {
Bind(binding *api.Binding) error
}
type binder struct {
sched scheduler.Scheduler
}
func New(sched scheduler.Scheduler) Binder {
return &binder{
sched: sched,
}
}
// implements binding.Registry, launches the pod-associated task in mesos
func (b *binder) Bind(binding *api.Binding) error {
ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
// default upstream scheduler passes pod.Name as binding.Name
podKey, err := podtask.MakePodKey(ctx, binding.Name)
if err != nil {
return err
}
b.sched.Lock()
defer b.sched.Unlock()
switch task, state := b.sched.Tasks().ForPod(podKey); state {
case podtask.StatePending:
return b.bind(ctx, binding, task)
default:
// in this case it's likely that the pod has been deleted between Schedule
// and Bind calls
log.Infof("No pending task for pod %s", podKey)
return errors.NoSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
}
}
func (b *binder) rollback(task *podtask.T, err error) error {
task.Offer.Release()
task.Reset()
if err2 := b.sched.Tasks().Update(task); err2 != nil {
log.Errorf("failed to update pod task: %v", err2)
}
return err
}
// assumes that the caller has acquired the scheduler lock and that the task is still pending
//
// bind does not actually do the binding itself, but launches the pod as a Mesos task. The
// kubernetes executor on the slave will finally do the binding. This is different from the
// upstream scheduler in the sense that the upstream scheduler does the binding and the
// kubelet will notice that and launch the pod.
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
// sanity check: ensure that the task hasAcceptedOffer(); it's possible that between
// Schedule() and now the offer for this task was rescinded or invalidated.
// ((we should never see this here))
if !task.HasAcceptedOffer() {
return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
}
// By this time, there is a chance that the slave is disconnected.
offerId := task.GetOfferId()
if offer, ok := b.sched.Offers().Get(offerId); !ok || offer.HasExpired() {
// already rescinded or timed out or otherwise invalidated
return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
}
if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\", cpu %.2f, mem %.2f MB",
task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name, task.Spec.CPU, task.Spec.Memory)
if err = b.sched.LaunchTask(task); err == nil {
b.sched.Offers().Invalidate(offerId)
task.Set(podtask.Launched)
if err = b.sched.Tasks().Update(task); err != nil {
// this should only happen if the task has been removed or has changed status,
// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
log.Errorf("failed to update task w/ Launched status: %v", err)
}
return
}
}
return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err))
}
//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
pod := task.Pod
// we make an effort here to avoid making changes to the task's copy of the pod, since
// we want that to reflect the initial user spec, and not the modified spec that we
// build for the executor to consume.
oemCt := pod.Spec.Containers
pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
}
task.SaveRecoveryInfo(pod.Annotations)
pod.Annotations[annotation.BindingHostKey] = task.Spec.AssignedSlave
for _, entry := range task.Spec.PortMap {
oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
ports := append([]api.ContainerPort{}, oemPorts...)
p := &ports[entry.PortIdx]
p.HostPort = int(entry.OfferPort)
op := strconv.FormatUint(entry.OfferPort, 10)
pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
if p.Name != "" {
pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
}
pod.Spec.Containers[entry.ContainerIdx].Ports = ports
}
// the kubelet-executor uses this to instantiate the pod
log.V(3).Infof("prepared pod spec: %+v", pod)
data, err := api.Codec.Encode(&pod)
if err != nil {
log.V(2).Infof("Failed to marshal the pod spec: %v", err)
return err
}
task.Spec.Data = data
return nil
}
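prepareTaskForLaunch relies on a clone-before-modify idiom: appending the containers onto an empty slice allocates a fresh backing array, so the later port edits do not leak back into the task's original pod spec. A tiny demonstration with a stand-in type:

package main

import "fmt"

type Container struct{ HostPort int }

func main() {
	original := []Container{{HostPort: 0}}

	// Shallow clone: new backing array, element values copied.
	clone := append([]Container{}, original...)
	clone[0].HostPort = 31000

	fmt.Println(original[0].HostPort) // 0 (untouched)
	fmt.Println(clone[0].HostPort)    // 31000
}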

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package binder implements the Binder which launches a task and lets the
// executor do the actual binding.
package binder

View File

@@ -0,0 +1,107 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"time"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/binder"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
)
const (
recoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling
FailedScheduling = "FailedScheduling"
Scheduled = "Scheduled"
)
type Controller interface {
Run(<-chan struct{})
}
type controller struct {
algorithm algorithm.SchedulerAlgorithm
binder binder.Binder
nextPod func() *api.Pod
error func(*api.Pod, error)
recorder record.EventRecorder
client *client.Client
started chan<- struct{} // startup latch
}
func New(client *client.Client, algorithm algorithm.SchedulerAlgorithm,
recorder record.EventRecorder, nextPod func() *api.Pod, error func(pod *api.Pod, schedulingErr error),
binder binder.Binder, started chan<- struct{}) Controller {
return &controller{
algorithm: algorithm,
binder: binder,
nextPod: nextPod,
error: error,
recorder: recorder,
client: client,
started: started,
}
}
func (s *controller) Run(done <-chan struct{}) {
defer close(s.started)
go runtime.Until(s.scheduleOne, recoveryDelay, done)
}
// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go,
// with the Modeler stuff removed since we don't need it with Mesos.
func (s *controller) scheduleOne() {
pod := s.nextPod()
// pods which are pre-scheduled (i.e. NodeName is set) are deleted by the kubelet
// in upstream. Not so in Mesos because the kubelet hasn't seen that pod yet. Hence,
// the scheduler has to take care of this:
if pod.Spec.NodeName != "" && pod.DeletionTimestamp != nil {
log.V(3).Infof("deleting pre-scheduled, not yet running pod: %s/%s", pod.Namespace, pod.Name)
s.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0))
return
}
log.V(3).Infof("Attempting to schedule: %+v", pod)
dest, err := s.algorithm.Schedule(pod)
if err != nil {
log.V(1).Infof("Failed to schedule: %+v", pod)
s.recorder.Eventf(pod, FailedScheduling, "Error scheduling: %v", err)
s.error(pod, err)
return
}
b := &api.Binding{
ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
Target: api.ObjectReference{
Kind: "Node",
Name: dest,
},
}
if err := s.binder.Bind(b); err != nil {
log.V(1).Infof("Failed to bind pod: %+v", err)
s.recorder.Eventf(pod, FailedScheduling, "Binding rejected: %v", err)
s.error(pod, err)
return
}
s.recorder.Eventf(pod, Scheduled, "Successfully assigned %v to %v", pod.Name, dest)
}
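Run above wires scheduleOne into runtime.Until, which keeps invoking the function, recovers from panics, and resumes after recoveryDelay until the done channel closes. A hedged sketch of that pattern, capturing the idea rather than the library's exact code:

package main

import (
	"fmt"
	"time"
)

// until repeatedly runs f, recovering from panics, until done closes.
func until(f func(), period time.Duration, done <-chan struct{}) {
	for {
		select {
		case <-done:
			return
		default:
		}
		func() {
			defer func() {
				if r := recover(); r != nil {
					fmt.Println("recovered:", r) // a crash delays, but does not kill, the loop
				}
			}()
			f()
		}()
		select {
		case <-done:
			return
		case <-time.After(period):
		}
	}
}

func main() {
	done := make(chan struct{})
	go until(func() { fmt.Println("scheduleOne tick") }, 100*time.Millisecond, done)
	time.Sleep(250 * time.Millisecond)
	close(done)
}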

View File

@@ -0,0 +1,20 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package controller implements the scheduling controller which waits for pod
// events from the queuer (i.e. from the apiserver), passes them to the
// SchedulerAlgorithm and in case of success to the binder which does the launch.
package controller

View File

@@ -0,0 +1,125 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deleter
import (
"time"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
)
type Deleter interface {
Run(updates <-chan queue.Entry, done <-chan struct{})
DeleteOne(pod *queuer.Pod) error
}
type deleter struct {
sched scheduler.Scheduler
qr queuer.Queuer
}
func New(sched scheduler.Scheduler, qr queuer.Queuer) Deleter {
return &deleter{
sched: sched,
qr: qr,
}
}
// currently monitors for "pod deleted" events, upon which DeleteOne()
// is invoked.
func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
go runtime.Until(func() {
for {
entry := <-updates
pod := entry.Value().(*queuer.Pod)
if entry.Is(queue.DELETE_EVENT) {
if err := k.DeleteOne(pod); err != nil {
log.Error(err)
}
} else if !entry.Is(queue.POP_EVENT) {
k.qr.UpdatesAvailable()
}
}
}, 1*time.Second, done)
}
func (k *deleter) DeleteOne(pod *queuer.Pod) error {
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return err
}
log.V(2).Infof("pod deleted: %v", podKey)
// order is important here: we want to make sure we have the lock before
// removing the pod from the scheduling queue. this makes the concurrent
// execution of scheduler-error-handling and delete-handling easier to
// reason about.
k.sched.Lock()
defer k.sched.Unlock()
// prevent the scheduler from attempting to pop this; it's also possible that
// it's concurrently being scheduled (somewhere between pod scheduling and
// binding) - if so, then we'll end up removing it from taskRegistry which
// will abort Bind()ing
k.qr.Dequeue(pod.GetUID())
switch task, state := k.sched.Tasks().ForPod(podKey); state {
case podtask.StateUnknown:
log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
return errors.NoSuchPodErr
// determine if the task has already been launched to mesos; if not, then
// cleanup is easier (unregister) since there's no state to sync
case podtask.StatePending:
if !task.Has(podtask.Launched) {
// we've been invoked in between Schedule() and Bind()
if task.HasAcceptedOffer() {
task.Offer.Release()
task.Reset()
task.Set(podtask.Deleted)
//TODO(jdef) probably want better handling here
if err := k.sched.Tasks().Update(task); err != nil {
return err
}
}
k.sched.Tasks().Unregister(task)
return nil
}
fallthrough
case podtask.StateRunning:
// signal to watchers that the related pod is going down
task.Set(podtask.Deleted)
if err := k.sched.Tasks().Update(task); err != nil {
log.Errorf("failed to update task w/ Deleted status: %v", err)
}
return k.sched.KillTask(task.ID)
default:
log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
return errors.NoSuchTaskErr
}
}
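The state dispatch in DeleteOne uses Go's explicit fallthrough so that a pending task that has already been launched is killed through the same path as a running one. A compact, self-contained sketch of that control flow; the states and actions are stand-ins:

package main

import "fmt"

type state int

const (
	stateUnknown state = iota
	statePending
	stateRunning
)

func deleteTask(s state, launched bool) string {
	switch s {
	case statePending:
		if !launched {
			return "unregister" // nothing running on the cluster yet
		}
		fallthrough // launched: treat like a running task
	case stateRunning:
		return "kill"
	default:
		return "no such task"
	}
}

func main() {
	fmt.Println(deleteTask(statePending, false)) // unregister
	fmt.Println(deleteTask(statePending, true))  // kill
	fmt.Println(deleteTask(stateRunning, true))  // kill
}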

View File

@@ -0,0 +1,160 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deleter
import (
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
types "k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
)
func TestDeleteOne_NonexistentPod(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
assert.Equal(0, len(q.List()))
d := New(obj, qr)
pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
Namespace: api.NamespaceDefault,
}}}
err := d.DeleteOne(pod)
assert.Equal(err, errors.NoSuchPodErr)
obj.AssertExpectations(t)
}
func TestDeleteOne_PendingPod(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)
pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", pod.Pod)
if err != nil {
t.Fatalf("failed to create task: %v", err)
}
_, err = reg.Register(task)
if err != nil {
t.Fatalf("failed to register task: %v", err)
}
// preconditions
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
q.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(q.List()))
_, found := q.Get("default/foo")
assert.True(found)
// exec & post conditions
d := New(obj, qr)
err = d.DeleteOne(pod)
assert.Nil(err)
_, found = q.Get("foo0")
assert.False(found)
assert.Equal(0, len(q.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_Running(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)
pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", pod.Pod)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task, err = reg.Register(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task.Set(podtask.Launched)
err = reg.Update(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// preconditions
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
q.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(q.List()))
_, found := q.Get("default/foo")
assert.True(found)
obj.On("KillTask", task.ID).Return(nil)
// exec & post conditions
d := New(obj, qr)
err = d.DeleteOne(pod)
assert.Nil(err)
_, found = q.Get("foo0")
assert.False(found)
assert.Equal(0, len(q.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_badPodNaming(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
pod := &queuer.Pod{Pod: &api.Pod{}}
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
d := New(obj, qr)
err := d.DeleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = "foo"
err = d.DeleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = ""
pod.Pod.ObjectMeta.Namespace = "bar"
err = d.DeleteOne(pod)
assert.NotNil(err)
obj.AssertExpectations(t)
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package deleter implements the deleter which listens for pod DELETE events
// from the apiserver and kills tasks for deleted pods.
package deleter

View File

@@ -0,0 +1,20 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package components implements independent aspects of the scheduler which
// do not use Framework or Scheduler internals, but rely solely on the Scheduler
// interface.
package components

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package errorhandler implements the ErrorHandler which handles scheduler errors
// and possibly requeues pods for scheduling again.
package errorhandler

View File

@@ -0,0 +1,97 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package errorhandler
import (
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/util"
)
type ErrorHandler interface {
Error(pod *api.Pod, schedulingErr error)
}
type errorHandler struct {
sched scheduler.Scheduler
backoff *backoff.Backoff
qr queuer.Queuer
newBreakChan func(podKey string) queue.BreakChan
}
func New(sched scheduler.Scheduler, backoff *backoff.Backoff, qr queuer.Queuer, newBC func(podKey string) queue.BreakChan) ErrorHandler {
return &errorHandler{
sched: sched,
backoff: backoff,
qr: qr,
newBreakChan: newBC,
}
}
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) Error(pod *api.Pod, schedulingErr error) {
if schedulingErr == errors.NoSuchPodErr {
log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
return
}
log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
defer util.HandleCrash()
// default upstream scheduler passes pod.Name as binding.PodID
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
return
}
k.backoff.GC()
k.sched.Lock()
defer k.sched.Unlock()
switch task, state := k.sched.Tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// if we don't have a mapping here any more then someone deleted the pod
log.V(2).Infof("Could not resolve pod to task, aborting pod reschdule: %s", podKey)
return
case podtask.StatePending:
if task.Has(podtask.Launched) {
log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
return
}
breakoutEarly := queue.BreakChan(nil)
if schedulingErr == errors.NoSuitableOffersErr {
log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
breakoutEarly = k.newBreakChan(podKey)
}
delay := k.backoff.Get(podKey)
log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
k.qr.Requeue(&queuer.Pod{Pod: pod, Delay: &delay, Notify: breakoutEarly})
default:
log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
}
}
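The breakout handling above means a pod waiting out its backoff can be requeued early, e.g. when suitable offers show up before the delay elapses. A simplified sketch of the wait-with-breakout mechanic, with illustrative names rather than the PR's backoff or queue packages:

package main

import (
	"fmt"
	"time"
)

// waitOrBreak sleeps for the backoff delay unless breakout fires first.
func waitOrBreak(delay time.Duration, breakout <-chan struct{}) {
	select {
	case <-time.After(delay):
		fmt.Println("backoff elapsed, requeueing")
	case <-breakout:
		fmt.Println("breakout signal, requeueing early")
	}
}

func main() {
	breakout := make(chan struct{})
	go func() {
		time.Sleep(10 * time.Millisecond)
		close(breakout) // e.g. fresh offers arrived for the pending pod
	}()
	waitOrBreak(time.Second, breakout)
}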

View File

@@ -14,5 +14,5 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
// Package slave manages node hostnames for slave ids.
package slave
// Package framework implements the Mesos scheduler.
package framework

View File

@@ -14,83 +14,13 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package framework
import (
"sync"
"testing"
mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/stretchr/testify/mock"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
)
// implements SchedulerInterface
type MockScheduler struct {
sync.RWMutex
mock.Mock
}
func (m *MockScheduler) slaveHostNameFor(id string) (hostName string) {
args := m.Called(id)
x := args.Get(0)
if x != nil {
hostName = x.(string)
}
return
}
func (m *MockScheduler) algorithm() (f PodScheduler) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(PodScheduler)
}
return
}
func (m *MockScheduler) createPodTask(ctx api.Context, pod *api.Pod) (task *podtask.T, err error) {
args := m.Called(ctx, pod)
x := args.Get(0)
if x != nil {
task = x.(*podtask.T)
}
err = args.Error(1)
return
}
func (m *MockScheduler) offers() (f offers.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(offers.Registry)
}
return
}
func (m *MockScheduler) tasks() (f podtask.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(podtask.Registry)
}
return
}
func (m *MockScheduler) killTask(taskId string) error {
args := m.Called(taskId)
return args.Error(0)
}
func (m *MockScheduler) launchTask(task *podtask.T) error {
args := m.Called(task)
return args.Error(0)
}
// @deprecated this is a placeholder for me to test the mock package
func TestNoSlavesYet(t *testing.T) {
obj := &MockScheduler{}
obj.On("slaveHostNameFor", "foo").Return(nil)
obj.slaveHostNameFor("foo")
obj.AssertExpectations(t)
}
/*-----------------------------------------------------------------------------
|
| this really belongs in the mesos-go package, but that's being updated soon
@@ -146,57 +76,84 @@ func (m *MockSchedulerDriver) Init() error {
args := m.Called()
return args.Error(0)
}
func (m *MockSchedulerDriver) Start() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Stop(b bool) (mesos.Status, error) {
args := m.Called(b)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Abort() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Join() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Run() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) RequestResources(r []*mesos.Request) (mesos.Status, error) {
args := m.Called(r)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) ReconcileTasks(statuses []*mesos.TaskStatus) (mesos.Status, error) {
args := m.Called(statuses)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) LaunchTasks(offerIds []*mesos.OfferID, ti []*mesos.TaskInfo, f *mesos.Filters) (mesos.Status, error) {
args := m.Called(offerIds, ti, f)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) KillTask(tid *mesos.TaskID) (mesos.Status, error) {
args := m.Called(tid)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) DeclineOffer(oid *mesos.OfferID, f *mesos.Filters) (mesos.Status, error) {
args := m.Called(oid, f)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) ReviveOffers() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) SendFrameworkMessage(eid *mesos.ExecutorID, sid *mesos.SlaveID, s string) (mesos.Status, error) {
args := m.Called(eid, sid, s)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Destroy() {
m.Called()
}
func (m *MockSchedulerDriver) Wait() {
m.Called()
}
type JoinableDriver struct {
MockSchedulerDriver
joinFunc func() (mesos.Status, error)
}
// Join invokes joinFunc if it has been set, otherwise blocks forever
func (m *JoinableDriver) Join() (mesos.Status, error) {
if m.joinFunc != nil {
return m.joinFunc()
}
select {}
}
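JoinableDriver shows the usual Go trick for specializing a mock: embed the base type and override a single method, with select {} blocking forever when no joinFunc is supplied. A generic, self-contained sketch of the embedding override, using toy types rather than the test code:

package main

import "fmt"

type base struct{}

func (base) Join() string { return "base Join" }

type joinable struct {
	base // embedded: Join is promoted from here
	joinFunc func() string
}

func (j joinable) Join() string {
	if j.joinFunc != nil {
		return j.joinFunc() // the override takes effect only when set
	}
	return j.base.Join()
}

func main() {
	fmt.Println(joinable{}.Join())                                            // base Join
	fmt.Println(joinable{joinFunc: func() string { return "custom" }}.Join()) // custom
}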

View File

@@ -0,0 +1,716 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"fmt"
"io"
"math"
"net/http"
"sync"
"time"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler"
execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
offermetrics "k8s.io/kubernetes/contrib/mesos/pkg/offers/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/tasksreconciler"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
merrors "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/kubelet/container"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/pkg/util/sets"
)
type Framework interface {
bindings.Scheduler
Init(sched scheduler.Scheduler, electedMaster proc.Process, mux *http.ServeMux) error
Registration() <-chan struct{}
Offers() offers.Registry
LaunchTask(t *podtask.T) error
KillTask(id string) error
}
type framework struct {
// We use a lock here to avoid races
// between invoking the mesos callback
*sync.RWMutex
// Config related, write-once
sched scheduler.Scheduler
schedulerConfig *schedcfg.Config
executor *mesos.ExecutorInfo
executorGroup uint64
client *client.Client
failoverTimeout float64 // in seconds
reconcileInterval int64
nodeRegistrator node.Registrator
storeFrameworkId func(id string)
// Mesos context
driver bindings.SchedulerDriver // late initialization
frameworkId *mesos.FrameworkID
masterInfo *mesos.MasterInfo
registered bool
registration chan struct{} // signal chan that closes upon first successful registration
onRegistration sync.Once
offers offers.Registry
slaveHostNames *slaveRegistry
// via deferred init
tasksReconciler taskreconciler.TasksReconciler
mux *http.ServeMux
reconcileCooldown time.Duration
asRegisteredMaster proc.Doer
terminate <-chan struct{} // signal chan, closes when we should kill background tasks
}
type Config struct {
SchedulerConfig schedcfg.Config
Executor *mesos.ExecutorInfo
Client *client.Client
StoreFrameworkId func(id string)
FailoverTimeout float64
ReconcileInterval int64
ReconcileCooldown time.Duration
LookupNode node.LookupFunc
}
// New creates a new Framework
func New(config Config) Framework {
var k *framework
k = &framework{
schedulerConfig: &config.SchedulerConfig,
RWMutex: new(sync.RWMutex),
executor: config.Executor,
executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
client: config.Client,
failoverTimeout: config.FailoverTimeout,
reconcileInterval: config.ReconcileInterval,
nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode),
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
// the node must be registered and have up-to-date labels
n := config.LookupNode(o.GetHostname())
if n == nil || !node.IsUpToDate(n, node.SlaveAttributesToLabels(o.GetAttributes())) {
return false
}
// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours
for _, eid := range o.GetExecutorIds() {
execuid := uid.Parse(eid.GetValue())
if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
return false
}
}
return true
},
DeclineOffer: func(id string) <-chan error {
errOnce := proc.NewErrorOnce(k.terminate)
errOuter := k.asRegisteredMaster.Do(func() {
var err error
defer errOnce.Report(err)
offerId := mutil.NewOfferID(id)
filters := &mesos.Filters{}
_, err = k.driver.DeclineOffer(offerId, filters)
})
return errOnce.Send(errOuter).Err()
},
// remember expired offers so that we can tell if a previously scheduled task relies on one
LingerTTL: config.SchedulerConfig.OfferLingerTTL.Duration,
TTL: config.SchedulerConfig.OfferTTL.Duration,
ListenerDelay: config.SchedulerConfig.ListenerDelay.Duration,
}),
slaveHostNames: newSlaveRegistry(),
reconcileCooldown: config.ReconcileCooldown,
registration: make(chan struct{}),
asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
return proc.ErrorChanf("cannot execute action with unregistered scheduler")
}),
storeFrameworkId: config.StoreFrameworkId,
}
return k
}
func (k *framework) Init(sched scheduler.Scheduler, electedMaster proc.Process, mux *http.ServeMux) error {
log.V(1).Infoln("initializing kubernetes mesos scheduler")
k.sched = sched
k.mux = mux
k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
if !k.registered {
return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
}
return electedMaster.Do(a)
})
k.terminate = electedMaster.Done()
k.offers.Init(k.terminate)
k.nodeRegistrator.Run(k.terminate)
return k.recoverTasks()
}
func (k *framework) asMaster() proc.Doer {
k.RLock()
defer k.RUnlock()
return k.asRegisteredMaster
}
func (k *framework) installDebugHandlers(mux *http.ServeMux) {
wrappedHandler := func(uri string, h http.Handler) {
mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
ch := make(chan struct{})
closer := runtime.Closer(ch)
proc.OnError(k.asMaster().Do(func() {
defer closer()
h.ServeHTTP(w, r)
}), func(err error) {
defer closer()
log.Warningf("failed HTTP request for %s: %v", uri, err)
w.WriteHeader(http.StatusServiceUnavailable)
}, k.terminate)
select {
case <-time.After(k.schedulerConfig.HttpHandlerTimeout.Duration):
log.Warningf("timed out waiting for request to be processed")
w.WriteHeader(http.StatusServiceUnavailable)
return
case <-ch: // noop
}
})
}
requestReconciliation := func(uri string, requestAction func()) {
wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestAction()
w.WriteHeader(http.StatusNoContent)
}))
}
requestReconciliation("/debug/actions/requestExplicit", k.tasksReconciler.RequestExplicit)
requestReconciliation("/debug/actions/requestImplicit", k.tasksReconciler.RequestImplicit)
wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
slaves := k.slaveHostNames.SlaveIDs()
for _, slaveId := range slaves {
_, err := k.driver.SendFrameworkMessage(
k.executor.ExecutorId,
mutil.NewSlaveID(slaveId),
messages.Kamikaze)
if err != nil {
log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
} else {
io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
}
}
io.WriteString(w, "OK")
}))
}
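// The debug endpoints installed above can be exercised manually, e.g.
// (host and port are illustrative):
//
//   curl http://localhost:10251/debug/actions/requestExplicit  # 204 No Content
//   curl http://localhost:10251/debug/actions/requestImplicit  # 204 No Content
//   curl http://localhost:10251/debug/actions/kamikaze         # asks executors to self-destruct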
func (k *framework) Registration() <-chan struct{} {
return k.registration
}
// Registered is called when the scheduler is successfully registered with the master.
func (k *framework) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)
k.driver = drv
k.frameworkId = fid
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.tasksReconciler.RequestExplicit()
}
// Reregistered is called when the scheduler is successfully re-registered with the master.
// This happens when the master fails over.
func (k *framework) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
log.Infof("Scheduler reregistered with the master: %v\n", mi)
k.driver = drv
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.tasksReconciler.RequestExplicit()
}
// perform one-time initialization actions upon the first registration event received from Mesos.
func (k *framework) onInitialRegistration(driver bindings.SchedulerDriver) {
defer close(k.registration)
if k.failoverTimeout > 0 {
refreshInterval := k.schedulerConfig.FrameworkIdRefreshInterval.Duration
if k.failoverTimeout < k.schedulerConfig.FrameworkIdRefreshInterval.Duration.Seconds() {
refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
}
go runtime.Until(func() {
k.storeFrameworkId(k.frameworkId.GetValue())
}, refreshInterval, k.terminate)
}
r1 := k.makeTaskRegistryReconciler()
r2 := k.makePodRegistryReconciler()
k.tasksReconciler = taskreconciler.New(k.asRegisteredMaster, taskreconciler.MakeComposite(k.terminate, r1, r2),
k.reconcileCooldown, k.schedulerConfig.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
go k.tasksReconciler.Run(driver, k.terminate)
if k.reconcileInterval > 0 {
ri := time.Duration(k.reconcileInterval) * time.Second
time.AfterFunc(k.schedulerConfig.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.tasksReconciler.RequestImplicit, ri, k.terminate) })
log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedulerConfig.InitialImplicitReconciliationDelay.Duration)
}
k.installDebugHandlers(k.mux)
}
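// Worked example for the refresh interval above (numbers illustrative): with
// FrameworkIdRefreshInterval = 30s and failoverTimeout = 10s the framework ID
// is re-stored every max(1, 10/2) = 5s; with failoverTimeout = 1.5s the
// interval is clamped to the 1s floor.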
// Disconnected is called when the scheduler loses connection to the master.
func (k *framework) Disconnected(driver bindings.SchedulerDriver) {
log.Infof("Master disconnected!\n")
k.registered = false
// discard all cached offers to avoid unnecessary TASK_LOST updates
k.offers.Invalidate("")
}
// ResourceOffers is called when the scheduler receives some offers from the master.
func (k *framework) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
log.V(2).Infof("Received offers %+v", offers)
// Record the offers in the global offer map as well as each slave's offer map.
k.offers.Add(offers)
for _, offer := range offers {
slaveId := offer.GetSlaveId().GetValue()
k.slaveHostNames.Register(slaveId, offer.GetHostname())
// create the api object if it does not already exist
if k.nodeRegistrator != nil {
labels := node.SlaveAttributesToLabels(offer.GetAttributes())
_, err := k.nodeRegistrator.Register(offer.GetHostname(), labels)
if err != nil {
log.Error(err)
}
}
}
}
// OfferRescinded is called when resources are rescinded from the scheduler.
func (k *framework) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
log.Infof("Offer rescinded %v\n", offerId)
oid := offerId.GetValue()
k.offers.Delete(oid, offermetrics.OfferRescinded)
}
// StatusUpdate is called when a status update message is sent to the scheduler.
func (k *framework) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
source, reason := "none", "none"
if taskStatus.Source != nil {
source = (*taskStatus.Source).String()
}
if taskStatus.Reason != nil {
reason = (*taskStatus.Reason).String()
}
taskState := taskStatus.GetState()
metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()
message := "none"
if taskStatus.Message != nil {
message = *taskStatus.Message
}
log.Infof(
"task status update %q from %q for task %q on slave %q executor %q for reason %q with message %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason,
message,
)
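// Dispatch summary (sketch) for the switch below:
//   STAGING/STARTING/RUNNING/FINISHED -> update registry; unknown non-FINISHED
//                                        tasks go through reconcileNonTerminalTask
//   FAILED/ERROR                      -> launched-but-unbound tasks are handed to
//                                        sched.Reconcile, otherwise fall through
//   LOST/KILLED (and fallthrough)     -> reconcileTerminalTask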
switch taskState {
case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
if _, state := k.sched.Tasks().UpdateStatus(taskStatus); state == podtask.StateUnknown {
if taskState != mesos.TaskState_TASK_FINISHED {
//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
//I don't want to reincarnate then.. TASK_LOST is a special case because
//the master is stateless and there are scenarios where I may get TASK_LOST
//followed by TASK_RUNNING.
//TODO(jdef) consider running this asynchronously since there are API server
//calls that may be made
k.reconcileNonTerminalTask(driver, taskStatus)
} // else, we don't really care about FINISHED tasks that aren't registered
return
}
if hostName := k.slaveHostNames.HostName(taskStatus.GetSlaveId().GetValue()); hostName == "" {
// a registered task has an update reported by a slave that we don't recognize;
// this should never happen, so we don't reconcile it.
log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
return
}
case mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
if task, _ := k.sched.Tasks().UpdateStatus(taskStatus); task != nil {
if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
go k.sched.Reconcile(task)
return
}
} else {
// unknown task failed, not much we can do about it
return
}
// last-ditch effort to reconcile our records
fallthrough
case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
k.reconcileTerminalTask(driver, taskStatus)
default:
log.Errorf(
"unknown task status %q from %q for task %q on slave %q executor %q for reason %q with message %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason,
message,
)
}
}
func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
task, state := k.sched.Tasks().UpdateStatus(taskStatus)
if (state == podtask.StateRunning || state == podtask.StatePending) &&
((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared)) {
//--
// pod-task has metadata that refers to:
// (1) a task that Mesos no longer knows about, or
// (2) a pod that the Kubelet will never report as "failed", or
// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart).
// For now, destroy the pod and hope that there's a replication controller backing it up.
// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
pod := &task.Pod
log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
if err := k.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
}
} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
// attempt to prevent dangling pods in the pod and task registries
log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
k.tasksReconciler.RequestExplicit()
} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
//If we're reconciling and receive this then the executor may be
//running a task that we need it to kill. It's possible that the framework
//is unrecognized by the master at this point, so KillTask is not guaranteed
//to do anything. The underlying driver transport may be able to send a
//FrameworkMessage directly to the slave to terminate the task.
log.V(2).Info("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
log.Error(err.Error())
}
}
}
// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *framework) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
// attempt to recover task from pod info:
// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
// - pull the pod metadata down from the api server
// - perform task recovery based on pod metadata
taskId := taskStatus.TaskId.GetValue()
if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
// there will be no data in the task status that we can use to determine the associated pod
switch taskStatus.GetState() {
case mesos.TaskState_TASK_STAGING:
// there is still hope for this task, don't kill it just yet
//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
return
default:
// for TASK_{STARTING,RUNNING} we should already have attempted recovery via recoverTasks().
// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
// be processing this reconciliation update before we process the one from the executor.
// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
// so it gets killed.
log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
}
} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
podStatus.Name, taskId, err)
} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
if t, ok, err := podtask.RecoverFrom(*pod); ok {
log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
_, err := k.sched.Tasks().Register(t)
if err != nil {
// someone beat us to it?!
log.Warningf("failed to register recovered task: %v", err)
return
}
k.sched.Tasks().UpdateStatus(taskStatus)
return
} else if err != nil {
//should kill the pod and the task
log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
}
} else {
//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
//metadata is not appropriate for task reconstruction -- which should almost certainly never
//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
//we were failed over.
//kill this task, allow the newly launched scheduler to schedule the new pod
log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
}
} else if errors.IsNotFound(err) {
// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
} else if errors.IsServerTimeout(err) {
log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
return
} else {
log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
return
}
if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
log.Errorf("failed to kill task %v: %v", taskId, err)
}
}
// FrameworkMessage is called when the scheduler receives a message from the executor.
func (k *framework) FrameworkMessage(driver bindings.SchedulerDriver,
executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) {
log.Infof("Received messages from executor %v of slave %v, %v\n", executorId, slaveId, message)
}
// SlaveLost is called when some slave is lost.
func (k *framework) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) {
log.Infof("Slave %v is lost\n", slaveId)
sid := slaveId.GetValue()
k.offers.InvalidateForSlave(sid)
// TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile
// tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically
// flush lost slaves older than X, and for which no tasks or pods reference.
// unfinished tasks/pods will be dropped. use a replication controller if you want pods to
// be restarted when slaves die.
}
// ExecutorLost is called when some executor is lost.
func (k *framework) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) {
log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status)
// TODO(yifan): Restart any unfinished tasks of the executor.
}
// Error is called when there is an unrecoverable error in the scheduler or scheduler driver.
// The driver should have been aborted before this is invoked.
func (k *framework) Error(driver bindings.SchedulerDriver, message string) {
log.Fatalf("fatal scheduler error: %v\n", message)
}
// filter func used for explicit task reconciliation, selects only non-terminal tasks which
// have been communicated to mesos (read: launched).
func explicitTaskFilter(t *podtask.T) bool {
switch t.State {
case podtask.StateRunning:
return true
case podtask.StatePending:
return t.Has(podtask.Launched)
default:
return false
}
}
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks listed in the scheduler's internal taskRegistry.
func (k *framework) makeTaskRegistryReconciler() taskreconciler.Action {
return taskreconciler.Action(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
taskToSlave := make(map[string]string)
for _, t := range k.sched.Tasks().List(explicitTaskFilter) {
if t.Spec.SlaveID != "" {
taskToSlave[t.ID] = t.Spec.SlaveID
}
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks identified by annotations in the Kubernetes pod registry.
func (k *framework) makePodRegistryReconciler() taskreconciler.Action {
return taskreconciler.Action(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
podList, err := k.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
if err != nil {
return proc.ErrorChanf("failed to reconcile pod registry: %v", err)
}
taskToSlave := make(map[string]string)
for _, pod := range podList.Items {
if len(pod.Annotations) == 0 {
continue
}
taskId, found := pod.Annotations[meta.TaskIdKey]
if !found {
continue
}
slaveId, found := pod.Annotations[meta.SlaveIdKey]
if !found {
continue
}
taskToSlave[taskId] = slaveId
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
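// For reference, the pod-registry reconciler above only considers pods that
// carry both annotations (example values are hypothetical):
//
//   pod.Annotations[meta.TaskIdKey]  == "pod.0a1b2c"         // mesos task ID
//   pod.Annotations[meta.SlaveIdKey] == "20151112-143025-S3" // mesos slave ID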
// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
func (k *framework) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
log.Info("explicit reconcile tasks")
// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
statusList := []*mesos.TaskStatus{}
remaining := sets.StringKeySet(taskToSlave)
for taskId, slaveId := range taskToSlave {
if slaveId == "" {
delete(taskToSlave, taskId)
continue
}
statusList = append(statusList, &mesos.TaskStatus{
TaskId: mutil.NewTaskID(taskId),
SlaveId: mutil.NewSlaveID(slaveId),
State: mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
})
}
select {
case <-cancel:
return merrors.ReconciliationCancelledErr
default:
if _, err := driver.ReconcileTasks(statusList); err != nil {
return err
}
}
start := time.Now()
first := true
for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
first = false
// nothing to do here other than wait for status updates..
if backoff > k.schedulerConfig.ExplicitReconciliationMaxBackoff.Duration {
backoff = k.schedulerConfig.ExplicitReconciliationMaxBackoff.Duration
}
select {
case <-cancel:
return merrors.ReconciliationCancelledErr
case <-time.After(backoff):
for taskId := range remaining {
if task, _ := k.sched.Tasks().Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
// keep this task in remaining list
continue
}
remaining.Delete(taskId)
}
}
}
return nil
}
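// Backoff behaviour above, by example: status updates are polled after 1s,
// 2s, 4s, 8s, ... capped at ExplicitReconciliationMaxBackoff (with a 32s cap:
// 1, 2, 4, 8, 16, 32, 32, ... seconds); the loop ends once every task in
// remaining has an update newer than start, or the operation is cancelled.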
func (ks *framework) recoverTasks() error {
podList, err := ks.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
if err != nil {
log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err)
return err
}
recoverSlave := func(t *podtask.T) {
slaveId := t.Spec.SlaveID
ks.slaveHostNames.Register(slaveId, t.Offer.Host())
}
for _, pod := range podList.Items {
if _, isMirrorPod := pod.Annotations[kubetypes.ConfigMirrorAnnotationKey]; isMirrorPod {
// mirrored pods are never reconciled because the scheduler isn't responsible for
// scheduling them; they're started by the executor/kubelet upon instantiation and
// reflected in the apiserver afterward. the scheduler has no knowledge of them.
continue
}
if t, ok, err := podtask.RecoverFrom(pod); err != nil {
log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err)
err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil)
//TODO(jdef) check for temporary or not-found errors
if err != nil {
log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err)
}
} else if ok {
ks.sched.Tasks().Register(t)
recoverSlave(t)
log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name)
}
}
return nil
}
func (ks *framework) KillTask(id string) error {
killTaskId := mutil.NewTaskID(id)
_, err := ks.driver.KillTask(killTaskId)
return err
}
func (ks *framework) LaunchTask(t *podtask.T) error {
// assume caller is holding scheduler lock
taskList := []*mesos.TaskInfo{t.BuildTaskInfo(ks.executor)}
offerIds := []*mesos.OfferID{t.Offer.Details().Id}
filters := &mesos.Filters{}
_, err := ks.driver.LaunchTasks(offerIds, taskList, filters)
return err
}
func (ks *framework) Offers() offers.Registry {
return ks.offers
}


@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package framework
import (
"reflect"
@@ -25,9 +25,9 @@ import (
"github.com/stretchr/testify/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/slave"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
)
@@ -81,12 +81,19 @@ func (r *mockRegistrator) Register(hostName string, labels map[string]string) (b
}
}
func mockScheduler() scheduler.Scheduler {
mockScheduler := &scheduler.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
mockScheduler.On("Tasks").Return(reg)
return mockScheduler
}
//test adding of a resource offer; it should be added to the offer registry and slaves
func TestResourceOffer_Add(t *testing.T) {
assert := assert.New(t)
registrator := &mockRegistrator{cache.NewStore(cache.MetaNamespaceKeyFunc)}
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -99,39 +106,40 @@ func TestResourceOffer_Add(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
nodeRegistrator: registrator,
sched: mockScheduler(),
}
hostname := "h1"
offerID1 := util.NewOfferID("test1")
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
testFramework.ResourceOffers(nil, offers1)
assert.Equal(1, len(registrator.store.List()))
assert.Equal(1, getNumberOffers(testScheduler.offers))
assert.Equal(1, getNumberOffers(testFramework.offers))
//check slave hostname
assert.Equal(1, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(1, len(testFramework.slaveHostNames.SlaveIDs()))
//add another offer
hostname2 := "h2"
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
testFramework.ResourceOffers(nil, offers2)
//check it is stored in registry
assert.Equal(2, getNumberOffers(testScheduler.offers))
assert.Equal(2, getNumberOffers(testFramework.offers))
//check slave hostnames
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
}
//test adding and rescinding of resource offers against the offer registry and slaves
func TestResourceOffer_Add_Rescind(t *testing.T) {
assert := assert.New(t)
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -144,42 +152,43 @@ func TestResourceOffer_Add_Rescind(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
sched: mockScheduler(),
}
hostname := "h1"
offerID1 := util.NewOfferID("test1")
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
testFramework.ResourceOffers(nil, offers1)
assert.Equal(1, getNumberOffers(testScheduler.offers))
assert.Equal(1, getNumberOffers(testFramework.offers))
//check slave hostname
assert.Equal(1, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(1, len(testFramework.slaveHostNames.SlaveIDs()))
//add another offer
hostname2 := "h2"
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
testFramework.ResourceOffers(nil, offers2)
assert.Equal(2, getNumberOffers(testScheduler.offers))
assert.Equal(2, getNumberOffers(testFramework.offers))
//check slave hostnames
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
//next, check whether offers can be rescinded
testScheduler.OfferRescinded(nil, offerID1)
assert.Equal(1, getNumberOffers(testScheduler.offers))
testFramework.OfferRescinded(nil, offerID1)
assert.Equal(1, getNumberOffers(testFramework.offers))
//next, check whether the remaining offer can be rescinded
testScheduler.OfferRescinded(nil, util.NewOfferID("test2"))
testFramework.OfferRescinded(nil, util.NewOfferID("test2"))
//walk offers again and check it is removed from registry
assert.Equal(0, getNumberOffers(testScheduler.offers))
assert.Equal(0, getNumberOffers(testFramework.offers))
//remove a non-existent ID
testScheduler.OfferRescinded(nil, util.NewOfferID("notExist"))
testFramework.OfferRescinded(nil, util.NewOfferID("notExist"))
}
//test that when a slave is lost we remove all offers
@@ -187,7 +196,7 @@ func TestSlave_Lost(t *testing.T) {
assert := assert.New(t)
//
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -197,45 +206,46 @@ func TestSlave_Lost(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
sched: mockScheduler(),
}
hostname := "h1"
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
testFramework.ResourceOffers(nil, offers1)
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
testFramework.ResourceOffers(nil, offers2)
//add another offer from different slaveID
hostname2 := "h2"
offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers3 := []*mesos.Offer{offer3}
testScheduler.ResourceOffers(nil, offers3)
testFramework.ResourceOffers(nil, offers3)
//test precondition
assert.Equal(3, getNumberOffers(testScheduler.offers))
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(3, getNumberOffers(testFramework.offers))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
//remove first slave
testScheduler.SlaveLost(nil, util.NewSlaveID(hostname))
testFramework.SlaveLost(nil, util.NewSlaveID(hostname))
//offers should be removed
assert.Equal(1, getNumberOffers(testScheduler.offers))
assert.Equal(1, getNumberOffers(testFramework.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
//remove second slave
testScheduler.SlaveLost(nil, util.NewSlaveID(hostname2))
testFramework.SlaveLost(nil, util.NewSlaveID(hostname2))
//offers should be removed
assert.Equal(0, getNumberOffers(testScheduler.offers))
assert.Equal(0, getNumberOffers(testFramework.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
//try to remove a non-existent slave
testScheduler.SlaveLost(nil, util.NewSlaveID("notExist"))
testFramework.SlaveLost(nil, util.NewSlaveID("notExist"))
}
@@ -244,7 +254,7 @@ func TestDisconnect(t *testing.T) {
assert := assert.New(t)
//
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -254,30 +264,31 @@ func TestDisconnect(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
sched: mockScheduler(),
}
hostname := "h1"
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
testFramework.ResourceOffers(nil, offers1)
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
testFramework.ResourceOffers(nil, offers2)
//add another offer from different slaveID
hostname2 := "h2"
offer3 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers3 := []*mesos.Offer{offer3}
testScheduler.ResourceOffers(nil, offers3)
testFramework.ResourceOffers(nil, offers3)
//disconnect
testScheduler.Disconnected(nil)
testFramework.Disconnected(nil)
//all offers should be removed
assert.Equal(0, getNumberOffers(testScheduler.offers))
assert.Equal(0, getNumberOffers(testFramework.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
}
//test we can handle different status updates, TODO check state transitions
@@ -287,7 +298,7 @@ func TestStatus_Update(t *testing.T) {
// setup expectations
mockdriver.On("KillTask", util.NewTaskID("test-task-001")).Return(mesos.Status_DRIVER_RUNNING, nil)
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -297,28 +308,28 @@ func TestStatus_Update(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
driver: &mockdriver,
taskRegistry: podtask.NewInMemoryRegistry(),
sched: mockScheduler(),
}
taskStatus_task_starting := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_RUNNING,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_starting)
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_starting)
taskStatus_task_running := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_RUNNING,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_running)
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_running)
taskStatus_task_failed := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_FAILED,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_failed)
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_failed)
//assert that mock was invoked
mockdriver.AssertExpectations(t)


@@ -14,25 +14,26 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package slave
package framework
import (
"sync"
)
type Registry struct {
// slaveRegistry manages node hostnames for slave ids.
type slaveRegistry struct {
lock sync.Mutex
hostNames map[string]string
}
func NewRegistry() *Registry {
return &Registry{
func newSlaveRegistry() *slaveRegistry {
return &slaveRegistry{
hostNames: map[string]string{},
}
}
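// Hypothetical usage sketch:
//
//   reg := newSlaveRegistry()
//   reg.Register("20151112-143025-S1", "node-1.example.com")
//   reg.HostName("20151112-143025-S1") // "node-1.example.com"
//   reg.SlaveIDs()                     // ["20151112-143025-S1"]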
// Register creates a mapping between a slaveId and slave if not existing.
func (st *Registry) Register(slaveId, slaveHostname string) {
func (st *slaveRegistry) Register(slaveId, slaveHostname string) {
st.lock.Lock()
defer st.lock.Unlock()
_, exists := st.hostNames[slaveId]
@@ -42,7 +43,7 @@ func (st *Registry) Register(slaveId, slaveHostname string) {
}
// SlaveIDs returns the keys of the registry
func (st *Registry) SlaveIDs() []string {
func (st *slaveRegistry) SlaveIDs() []string {
st.lock.Lock()
defer st.lock.Unlock()
slaveIds := make([]string, 0, len(st.hostNames))
@@ -53,7 +54,7 @@ func (st *Registry) SlaveIDs() []string {
}
// HostName looks up a hostname for a given slaveId
func (st *Registry) HostName(slaveId string) string {
func (st *slaveRegistry) HostName(slaveId string) string {
st.lock.Lock()
defer st.lock.Unlock()
return st.hostNames[slaveId]


@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package slave
package framework
import (
"testing"
@@ -26,7 +26,7 @@ import (
func TestSlaveStorage_Register(t *testing.T) {
assert := assert.New(t)
slaveStorage := NewRegistry()
slaveStorage := newSlaveRegistry()
assert.Equal(0, len(slaveStorage.hostNames))
slaveId := "slave1"
@@ -42,7 +42,7 @@ func TestSlaveStorage_Register(t *testing.T) {
func TestSlaveStorage_HostName(t *testing.T) {
assert := assert.New(t)
slaveStorage := NewRegistry()
slaveStorage := newSlaveRegistry()
assert.Equal(0, len(slaveStorage.hostNames))
slaveId := "slave1"
@@ -62,7 +62,7 @@ func TestSlaveStorage_HostName(t *testing.T) {
func TestSlaveStorage_SlaveIds(t *testing.T) {
assert := assert.New(t)
slaveStorage := NewRegistry()
slaveStorage := newSlaveRegistry()
assert.Equal(0, len(slaveStorage.hostNames))
slaveId := "1"


@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package podreconciler implements pod reconciliation for pods which failed
// to launch, i.e. before binding by the executor took place.
package podreconciler


@@ -0,0 +1,120 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podreconciler
import (
"time"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/deleter"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
apierrors "k8s.io/kubernetes/pkg/api/errors"
client "k8s.io/kubernetes/pkg/client/unversioned"
)
// PodReconciler reconciles a pod with the apiserver
type PodReconciler interface {
Reconcile(t *podtask.T)
}
type podReconciler struct {
sched scheduler.Scheduler
client *client.Client
qr queuer.Queuer
deleter deleter.Deleter
}
func New(sched scheduler.Scheduler, client *client.Client, qr queuer.Queuer, deleter deleter.Deleter) PodReconciler {
return &podReconciler{
sched: sched,
client: client,
qr: qr,
deleter: deleter,
}
}
// this pod may be out of sync with respect to the API server registry:
// this pod | apiserver registry
// -------------|----------------------
// host=.* | 404 ; pod was deleted
// host=.* | 5xx ; failed to sync, try again later?
// host="" | host="" ; perhaps no updates to process?
// host="" | host="..." ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
// host="..." | host="" ; pod is no longer scheduled, does it need to be re-queued?
// host="..." | host="..." ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *podReconciler) Reconcile(t *podtask.T) {
log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
if err != nil {
if apierrors.IsNotFound(err) {
// attempt to delete
if err = s.deleter.DeleteOne(&queuer.Pod{Pod: &t.Pod}); err != nil && err != errors.NoSuchPodErr && err != errors.NoSuchTaskErr {
log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
}
} else {
//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
//For now, drop the pod on the floor
log.Warning("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
}
return
}
log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
if t.Spec.AssignedSlave != pod.Spec.NodeName {
if pod.Spec.NodeName == "" {
// pod is unscheduled.
// it's possible that we dropped the pod in the scheduler error handler
// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Error(err)
return
}
s.sched.Lock()
defer s.sched.Unlock()
if _, state := s.sched.Tasks().ForPod(podKey); state != podtask.StateUnknown {
//TODO(jdef) reconcile the task
log.Errorf("task already registered for pod %v", pod.Name)
return
}
now := time.Now()
log.V(3).Infof("reoffering pod %v", podKey)
s.qr.Reoffer(queuer.NewPodWithDeadline(pod, &now))
} else {
// pod is scheduled.
// not sure how this happened behind our backs. attempt to reconstruct
// at least a partial podtask.T record.
//TODO(jdef) reconcile the task
log.Errorf("pod already scheduled: %v", pod.Name)
}
} else {
//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
//and assume that our knowledge of the pod aligns with that of the apiserver
log.Error("pod reconciliation does not support updates; not yet implemented")
}
}


@@ -0,0 +1,63 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package components
import (
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
)
// Consumes *api.Pod, produces *queuer.Pod; the k8s reflector wants to push
// *api.Pod objects at us, but we want to store the more flexible queuer.Pod
// type instead. The adapter implementation facilitates this. It's a little
// hackish since the object type going in is different than the object type
// coming out -- you've been warned.
type podStoreAdapter struct {
queue.FIFO
}
func (psa *podStoreAdapter) Add(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Add(&queuer.Pod{Pod: pod})
}
func (psa *podStoreAdapter) Update(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Update(&queuer.Pod{Pod: pod})
}
func (psa *podStoreAdapter) Delete(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Delete(&queuer.Pod{Pod: pod})
}
func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
pod := obj.(*api.Pod)
return psa.FIFO.Get(&queuer.Pod{Pod: pod})
}
// Replace will delete the contents of the store, using instead the
// given list. This store implementation does NOT take ownership of the list.
func (psa *podStoreAdapter) Replace(objs []interface{}, resourceVersion string) error {
newobjs := make([]interface{}, len(objs))
for i, v := range objs {
pod := v.(*api.Pod)
newobjs[i] = &queuer.Pod{Pod: pod}
}
return psa.FIFO.Replace(newobjs, resourceVersion)
}
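// Assumed wiring sketch, mirroring how New() below plugs the adapter between
// the reflector and the queue (lw and backlog come from the caller):
//
//   bypass := make(chan queue.Entry, backlog)
//   store := &podStoreAdapter{queue.NewHistorical(bypass)}
//   cache.NewReflector(lw, &api.Pod{}, store, 0) // pushes *api.Pod, stored as *queuer.Pod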


@@ -0,0 +1,137 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package components
import (
"net/http"
"sync"
mesos "github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/binder"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/controller"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/deleter"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/errorhandler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/podreconciler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
)
// sched implements the Scheduler interface.
type sched struct {
podReconciler podreconciler.PodReconciler
framework framework.Framework
controller controller.Controller
// unsafe state, needs to be guarded, especially changes to podtask.T objects
sync.RWMutex
taskRegistry podtask.Registry
}
func New(c *config.Config, fw framework.Framework, ps podschedulers.PodScheduler,
client *client.Client, recorder record.EventRecorder, terminate <-chan struct{}, mux *http.ServeMux, lw *cache.ListWatch) scheduler.Scheduler {
core := &sched{
framework: fw,
taskRegistry: podtask.NewInMemoryRegistry(),
}
// Watch and queue pods that need scheduling.
podUpdatesBypass := make(chan queue.Entry, c.UpdatesBacklog)
podUpdates := &podStoreAdapter{queue.NewHistorical(podUpdatesBypass)}
reflector := cache.NewReflector(lw, &api.Pod{}, podUpdates, 0)
q := queuer.New(queue.NewDelayFIFO(), podUpdates)
algorithm := algorithm.New(core, podUpdates, ps)
podDeleter := deleter.New(core, q)
core.podReconciler = podreconciler.New(core, client, q, podDeleter)
bo := backoff.New(c.InitialPodBackoff.Duration, c.MaxPodBackoff.Duration)
newBC := func(podKey string) queue.BreakChan {
return queue.BreakChan(core.Offers().Listen(podKey, func(offer *mesos.Offer) bool {
core.Lock()
defer core.Unlock()
switch task, state := core.Tasks().ForPod(podKey); state {
case podtask.StatePending:
// Assess fitness of pod with the current offer. The scheduler normally
// "backs off" when it can't find an offer that matches up with a pod.
// The backoff period for a pod can terminate sooner if an offer becomes
// available that matches up.
return !task.Has(podtask.Launched) && ps.FitPredicate()(task, offer, nil)
default:
// no point in continuing to check for matching offers
return true
}
}))
}
errorHandler := errorhandler.New(core, bo, q, newBC)
binder := binder.New(core)
startLatch := make(chan struct{})
runtime.On(startLatch, func() {
reflector.Run() // TODO(jdef) should listen for termination
podDeleter.Run(podUpdatesBypass, terminate)
q.Run(terminate)
q.InstallDebugHandlers(mux)
podtask.InstallDebugHandlers(core.Tasks(), mux)
})
core.controller = controller.New(client, algorithm, recorder, q.Yield, errorHandler.Error, binder, startLatch)
return core
}
func (c *sched) Run(done <-chan struct{}) {
c.controller.Run(done)
}
func (c *sched) Reconcile(t *podtask.T) {
c.podReconciler.Reconcile(t)
}
func (c *sched) Tasks() podtask.Registry {
return c.taskRegistry
}
func (c *sched) Offers() offers.Registry {
return c.framework.Offers()
}
func (c *sched) KillTask(id string) error {
return c.framework.KillTask(id)
}
func (c *sched) LaunchTask(t *podtask.T) error {
return c.framework.LaunchTask(t)
}


@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package taskreconciler implements Mesos task reconciliation.
package taskreconciler


@@ -0,0 +1,235 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package taskreconciler
import (
"fmt"
"time"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
bindings "github.com/mesos/mesos-go/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
)
type Action func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error
type TasksReconciler interface {
RequestExplicit()
RequestImplicit()
Run(driver bindings.SchedulerDriver, done <-chan struct{})
}
type tasksReconciler struct {
proc.Doer
Action Action
explicit chan struct{} // send an empty struct to trigger explicit reconciliation
implicit chan struct{} // send an empty struct to trigger implicit reconciliation
cooldown time.Duration
explicitReconciliationAbortTimeout time.Duration
}
func New(doer proc.Doer, action Action,
cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) TasksReconciler {
return &tasksReconciler{
Doer: doer,
explicit: make(chan struct{}, 1),
implicit: make(chan struct{}, 1),
cooldown: cooldown,
explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
// trigger the reconciler action in the doer's execution context,
// but it could take a while and the scheduler needs to be able to
// process updates, the callbacks for which ALSO execute in the SAME
// deferred execution context -- so the action MUST be executed async.
errOnce := proc.NewErrorOnce(cancel)
return errOnce.Send(doer.Do(func() {
// only triggers the action if we're the currently elected,
// registered master and runs the action async.
go func() {
var err <-chan error
defer errOnce.Send(err)
err = action(driver, cancel)
}()
})).Err()
},
}
}
func (r *tasksReconciler) RequestExplicit() {
select {
case r.explicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
func (r *tasksReconciler) RequestImplicit() {
select {
case r.implicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
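// The buffered channels (capacity 1) coalesce bursts of requests: while a
// request is pending, further ones are dropped, e.g.
//
//   r.RequestExplicit()
//   r.RequestExplicit() // no-op, an explicit run is already queued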
// execute task reconciliation, returns when done is closed. intended to run as a goroutine.
// if reconciliation is requested while another is in progress, the in-progress operation will be
// cancelled before the new reconciliation operation begins.
func (r *tasksReconciler) Run(driver bindings.SchedulerDriver, done <-chan struct{}) {
var cancel, finished chan struct{}
requestLoop:
for {
select {
case <-done:
return
default: // proceed
}
select {
case <-r.implicit:
metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
select {
case <-done:
return
case <-r.explicit:
break // give preference to a pending request for explicit
default: // continue
// don't run implicit reconciliation while explicit is ongoing
if finished != nil {
select {
case <-finished: // continue w/ implicit
default:
log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
continue requestLoop
}
}
errOnce := proc.NewErrorOnce(done)
errCh := r.Do(func() {
var err error
defer errOnce.Report(err)
log.Infoln("implicit reconcile tasks")
metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
}
})
proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
log.Errorf("failed to run implicit reconciliation: %v", err)
}, done)
goto slowdown
}
case <-done:
return
case <-r.explicit: // continue
metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
}
if cancel != nil {
close(cancel)
cancel = nil
// play nice and wait for the prior operation to finish, complain
// if it doesn't
select {
case <-done:
return
case <-finished: // noop, expected
case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
log.Error("reconciler action failed to stop upon cancellation")
}
}
// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
// if cancellation takes too long or fails - we don't want to close the same chan
// more than once
cancel = make(chan struct{})
finished = make(chan struct{})
go func(fin chan struct{}) {
startedAt := time.Now()
defer func() {
metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
}()
metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
defer close(fin)
err := <-r.Action(driver, cancel)
if err == errors.ReconciliationCancelledErr {
metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
log.Infoln(err.Error())
} else if err != nil {
log.Errorf("reconciler action failed: %v", err)
}
}(finished)
slowdown:
// don't allow reconciliation to run very frequently, either explicit or implicit
select {
case <-done:
return
case <-time.After(r.cooldown): // noop
}
} // for
}
// MakeComposite invokes the given Action funcs in sequence, aborting the sequence if reconciliation
// is cancelled. If any other error occurs, the composite reconciler will attempt to complete the
// sequence, reporting only the last generated error.
func MakeComposite(done <-chan struct{}, actions ...Action) Action {
if x := len(actions); x == 0 {
// programming error
panic("no actions specified for composite reconciler")
} else if x == 1 {
return actions[0]
}
chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b Action) <-chan error {
ech := a(d, c)
ch := make(chan error, 1)
go func() {
select {
case <-done:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
ech = b(d, c)
select {
case <-done:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
close(ch)
return
}
}
ch <- fmt.Errorf("aborting composite reconciler action")
}()
return ch
}
result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, actions[0], actions[1])
}
for i := 2; i < len(actions); i++ {
i := i
next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, Action(result), actions[i])
}
result = next
}
return Action(result)
}
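// Hypothetical composition sketch: reconcile the task registry first, then
// the pod registry, surfacing the last error (if any):
//
//   composite := MakeComposite(done, taskRegistryAction, podRegistryAction)
//   if err := <-composite(driver, cancel); err != nil {
//       log.Errorf("composite reconciliation failed: %v", err)
//   }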


@@ -16,3 +16,58 @@ limitations under the License.
// Package scheduler implements the Kubernetes Mesos scheduler.
package scheduler
// Created from contrib/mesos/docs/scheduler.monopic:
//
// ┌───────────────────────────────────────────────────────────────────────┐
// │ ┌───────────────────────────────────────┐ ┌─┴──────────────────────┐ ┌───────────────┐
// ┌────────▼─────────┐ │Queuer │ Await() │ podUpdates │ │ │
// │ podUpdatesBypass │ │- Yield() *api.Pod ├──pod CRUD ─▶ (queue.HistoricalFIFO) ◀──reflector──▶pods ListWatch ├──apiserver──▶
// └────────▲─────────┘ │- Requeue(pod)/Dequeue(id)/Reoffer(pod)│ events │ │ │ │
// │ └───────────────────▲───────────────────┘ └───────────┬────────────┘ └───────────────┘
// │ │ │
// │ │ │
// └───────────────┐┌───────────────────▲────────────────────▲─────────────────────┐ └───────────────────────┐
// ││ │ │ ┌────────────────────┼─────────────────┐
// ┌───────────────────┼┼──────────────────────────────────────┐ │ ┌───────────────────┼────┼───────────┐ │ │
// ┌───────────▼──────────┐┌───────┴┴───────┐ ┌───────────────────┐ ┌──┴─┴─┴──────┐ ┌────────┴────┴───┐ ┌────▼────────▼─────────────┐ │
// │Binder (task launcher)││Deleter │ │PodReconciler │ │Controller │ │ ErrorHandler │ │SchedulerAlgorithm │ │
// │- Bind(binding) ││- DeleteOne(pod)│ │- Reconcile(pod) │ │- Run() │ │- Error(pod, err)│ │- Schedule(pod) -> NodeName│ │
// │ ││ │◀──│ │ │ │──▶│ │ │ │ │
// │ ┌─────┐││ ┌─────┐ │ │ ┌─────┐ │ │ ┌─────┐ │ │ ┌─────┐ │ │┌─────┐ │ │
// └───────────────┤sched├┘└────┤sched├─────┘ └──────┤sched├───▲──┘ └───┤sched├───┘ └────┤sched├──────┘ └┤sched├──────────────┬─────┘ │
// ├-│││-┴──────┴--││-┴────────────────┴--│--┴───┼──────────┴--│--┴────────────┴-│---┴──────────┴-│││-┤ ┌────────────▼─────────▼─────────┐
// │ │││ ││ │ │ │ │ │││ │ │ podScheduler │
// │ ││└───────────▼┼─────────────────────▼──────┼─────────────▼─────────────────▼────────────────┘││ │ │ (e.g. fcfsPodScheduler) │
// │ │└─────────────┼────────────────────────────┼─────────────┼──────────────────▼────────────────┘│ │ │ │
// │ │ │ │ │ │ │ │ │ scheduleOne(pod, offers ...) │
// │ │ │ │ │ │ │ │ │ ┌──────────────────────────┤
// │ │ │ ╲ │ │ │ │ │ │ ▼ │ │ │ allocationStrategy │
// │ │ │ ╲ └┐ │ ┌┘ │ │ │ │ │ │ - FitPredicate │
// │ │ │ ╲ │ │ │ │ │ │ │ │ │ - Procurement │
// │ │ │ ╲ └┐ │ ┌┘ │ │ │ │ └─────┴──────────────────────────┘
// │┌▼────────────┐┌▼──────────┐┌─▼─▼─▼─▼─▼─┐┌───┴────────┐┌───▼───┐ ┌────▼───┐ │
// ││LaunchTask(t)││KillTask(t)││sync.Mutex ││reconcile(t)││Tasks()│ │Offers()│ │
// │└──────┬──────┘└─────┬─────┘└───────────┘└────────▲───┘└───┬───┘ └────┬───┘ │
// │ │ │ │ │ │ │
// │ │ └──────────────────┐ │ ┌───▼────────────┐ │ │
// │ └──────────────────────────────┐ │ │ │podtask.Registry│ │ │
// │ │ │ │ └────────────────┘ │ │ ┌──────────────────────┐
// │ │ │ │ │ │ │ │
// │Scheduler │ └──────┐ │ │ │ │ A ──────────▶ B │
// └──────────────────────────────────────┼────────┼─┬│----┬──────────────────────┼───────────────────┘ │ │
// ┌──────────────────────────────────────┼────────┼─┤sched├──────────────────────┼─────────────────────────┐ │ A has a reference │
// │Framework │ │ └─────┘ ┌────▼───┐ │ │ on B and calls B │
// │ ┌──────▼──────┐┌▼──────────┐ │Offers()│ │ │ │
// │ │LaunchTask(t)││KillTask(t)│ └────┬───┘ │ └──────────────────────┘
// │ └─────────┬───┘└──────┬────┘ ┌────────▼───────┐ │
// │implements: mesos-go/scheduler.Scheduler └───────────▼ │offers.Registry │ │
// │ │ └────────────────┘ │
// │ ┌─────────────────┐ ┌──▼─────────────┐ │
// └────────────────────────┤ ├───────┤ Mesos ├────────────────────────────────────┘
// │ TasksReconciler │ │ Scheduler │
// │ ├───────▶ Driver │
// └─────────────────┘ └────────┬───────┘
// │
// │
// ▼
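
The control flow in the diagram mirrors the upstream plugin loop: the Queuer yields a pod, the SchedulerAlgorithm picks a node, the Binder launches the matching Mesos task, and the ErrorHandler backs off and requeues on failure. A compressed sketch of that loop using the method names from the boxes above (event recording and shutdown plumbing omitted; the receiver names are illustrative):

for {
	pod := qr.Yield()                  // Queuer: blocks until an unscheduled pod is available
	nodeName, err := alg.Schedule(pod) // SchedulerAlgorithm: Schedule(pod) -> NodeName
	if err != nil {
		errh.Error(pod, err) // ErrorHandler: backoff, then Requeue/Reoffer via the Queuer
		continue
	}
	binding := &api.Binding{
		ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
		Target:     api.ObjectReference{Kind: "Node", Name: nodeName},
	}
	if err := binder.Bind(binding); err != nil { // Binder launches the pod as a Mesos task
		errh.Error(pod, err)
	}
}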

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package errors contains the errors used throughout the scheduler
package errors

View File

@@ -0,0 +1,28 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package errors
import (
"errors"
)
var (
NoSuchPodErr = errors.New("No such pod exists")
NoSuchTaskErr = errors.New("No such task exists")
ReconciliationCancelledErr = errors.New("explicit task reconciliation cancelled")
NoSuitableOffersErr = errors.New("No suitable offers for pod/task")
)
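
These are sentinel values, so callers across the scheduler match them by identity rather than by message text. A hedged sketch of a typical call site (the deleter d is illustrative; DeleteOne is the Deleter method from the component diagram):

if err := d.DeleteOne(pod); err == errors.NoSuchPodErr || err == errors.NoSuchTaskErr {
	// pod/task already gone; deletion is a no-op
	log.V(2).Infof("pod %q already deleted", pod.Name)
} else if err != nil {
	return err
}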

View File

@@ -112,10 +112,10 @@ type SchedulerProcess struct {
fin chan struct{}
}
func New(sched bindings.Scheduler) *SchedulerProcess {
func New(framework bindings.Scheduler) *SchedulerProcess {
p := &SchedulerProcess{
Process: proc.New(),
Scheduler: sched,
Scheduler: framework,
stage: initStage,
elected: make(chan struct{}),
failover: make(chan struct{}),

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package integration implements integration tests.
package integration

View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package integration
import (
"encoding/json"
@@ -25,14 +25,6 @@ import (
"testing"
"time"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/testapi"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/client/cache"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/runtime"
"k8s.io/kubernetes/pkg/watch"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/mesos/mesos-go/mesosutil"
@@ -41,13 +33,24 @@ import (
"github.com/stretchr/testify/mock"
assertext "k8s.io/kubernetes/contrib/mesos/pkg/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/controller"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/testapi"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/client/cache"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/runtime"
"k8s.io/kubernetes/pkg/util"
"k8s.io/kubernetes/pkg/watch"
)
// An apiserver mock which partially mocks the pods API
@@ -399,19 +402,6 @@ func (a *EventAssertions) EventWithReason(observer *EventObserver, reason string
}, msgAndArgs...)
}
type joinableDriver struct {
MockSchedulerDriver
joinFunc func() (mesos.Status, error)
}
// Join invokes joinFunc if it has been set, otherwise blocks forever
func (m *joinableDriver) Join() (mesos.Status, error) {
if m.joinFunc != nil {
return m.joinFunc()
}
select {}
}
// Create mesos.TaskStatus for a given task
func newTaskStatusForTask(task *mesos.TaskInfo, state mesos.TaskState) *mesos.TaskStatus {
healthy := state == mesos.TaskState_TASK_RUNNING
@@ -436,12 +426,12 @@ type LaunchedTask struct {
type lifecycleTest struct {
apiServer *TestServer
driver *joinableDriver
driver *framework.JoinableDriver
eventObs *EventObserver
plugin *schedulingPlugin
podsListWatch *MockPodsListWatch
scheduler *KubernetesScheduler
framework framework.Framework
schedulerProc *ha.SchedulerProcess
sched scheduler.Scheduler
t *testing.T
}
@@ -454,15 +444,33 @@ func newLifecycleTest(t *testing.T) lifecycleTest {
// create fake apiserver
apiServer := NewTestServer(t, api.NamespaceDefault, podsListWatch)
// create executor with some data for static pods if set
executor := mesosutil.NewExecutorInfo(
// create ExecutorInfo with some data for static pods if set
ei := mesosutil.NewExecutorInfo(
mesosutil.NewExecutorID("executor-id"),
mesosutil.NewCommandInfo("executor-cmd"),
)
executor.Data = []byte{0, 1, 2}
ei.Data = []byte{0, 1, 2}
// create scheduler
strategy := NewAllocationStrategy(
// create framework
client := client.NewOrDie(&client.Config{
Host: apiServer.server.URL,
Version: testapi.Default.Version(),
})
c := *schedcfg.CreateDefaultConfig()
fw := framework.New(framework.Config{
Executor: ei,
Client: client,
SchedulerConfig: c,
LookupNode: apiServer.LookupNode,
})
// TODO(sttts): re-enable the following tests
// assert.NotNil(framework.client, "client is nil")
// assert.NotNil(framework.executor, "executor is nil")
// assert.NotNil(framework.offers, "offer registry is nil")
// create pod scheduler
strategy := podschedulers.NewAllocationStrategy(
podtask.NewDefaultPredicate(
mresource.DefaultDefaultContainerCPULimit,
mresource.DefaultDefaultContainerMemLimit,
@@ -472,64 +480,39 @@ func newLifecycleTest(t *testing.T) lifecycleTest {
mresource.DefaultDefaultContainerMemLimit,
),
)
scheduler := New(Config{
Executor: executor,
Client: client.NewOrDie(&client.Config{
Host: apiServer.server.URL,
Version: testapi.Default.Version(),
}),
Scheduler: NewFCFSPodScheduler(strategy, apiServer.LookupNode),
Schedcfg: *schedcfg.CreateDefaultConfig(),
LookupNode: apiServer.LookupNode,
})
assert.NotNil(scheduler.client, "client is nil")
assert.NotNil(scheduler.executor, "executor is nil")
assert.NotNil(scheduler.offers, "offer registry is nil")
fcfs := podschedulers.NewFCFSPodScheduler(strategy, apiServer.LookupNode)
// create scheduler process
schedulerProc := ha.New(scheduler)
schedulerProc := ha.New(fw)
// get plugin config from it
config := scheduler.NewPluginConfig(
schedulerProc.Terminal(),
http.DefaultServeMux,
&podsListWatch.ListWatch,
)
assert.NotNil(config)
// make events observable
// create scheduler
eventObs := NewEventObserver()
config.Recorder = eventObs
// create plugin
plugin := NewPlugin(config).(*schedulingPlugin)
assert.NotNil(plugin)
scheduler := components.New(&c, fw, fcfs, client, eventObs, schedulerProc.Terminal(), http.DefaultServeMux, &podsListWatch.ListWatch)
assert.NotNil(scheduler)
// create mock mesos scheduler driver
driver := &joinableDriver{}
driver := &framework.JoinableDriver{}
return lifecycleTest{
apiServer: apiServer,
driver: driver,
eventObs: eventObs,
plugin: plugin,
podsListWatch: podsListWatch,
scheduler: scheduler,
framework: fw,
schedulerProc: schedulerProc,
sched: scheduler,
t: t,
}
}
func (lt lifecycleTest) Start() <-chan LaunchedTask {
assert := &EventAssertions{*assert.New(lt.t)}
lt.plugin.Run(lt.schedulerProc.Terminal())
lt.sched.Run(lt.schedulerProc.Terminal())
// init scheduler
err := lt.scheduler.Init(
// init framework
err := lt.framework.Init(
lt.sched,
lt.schedulerProc.Master(),
lt.plugin,
http.DefaultServeMux,
)
assert.NoError(err)
@@ -582,7 +565,7 @@ func (lt lifecycleTest) Start() <-chan LaunchedTask {
<-started
// tell scheduler to be registered
lt.scheduler.Registered(
lt.framework.Registered(
lt.driver,
mesosutil.NewFrameworkID("kubernetes-id"),
mesosutil.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050),
@@ -601,19 +584,10 @@ func (lt lifecycleTest) End() <-chan struct{} {
return lt.schedulerProc.End()
}
// Test to create the scheduler plugin with an empty plugin config
func TestPlugin_New(t *testing.T) {
assert := assert.New(t)
c := PluginConfig{}
p := NewPlugin(&c)
assert.NotNil(p)
}
// TestPlugin_LifeCycle creates a scheduler plugin with the config returned by the scheduler,
// TestScheduler_LifeCycle creates a scheduler plugin with the config returned by the scheduler,
// and plays through the whole life cycle of the plugin while creating pods, deleting
// and failing them.
func TestPlugin_LifeCycle(t *testing.T) {
func TestScheduler_LifeCycle(t *testing.T) {
assert := &EventAssertions{*assert.New(t)}
lt := newLifecycleTest(t)
defer lt.Close()
@@ -627,29 +601,29 @@ func TestPlugin_LifeCycle(t *testing.T) {
lt.podsListWatch.Add(pod, true) // notify watchers
// wait for failedScheduling event because there is no offer
assert.EventWithReason(lt.eventObs, FailedScheduling, "failedScheduling event not received")
assert.EventWithReason(lt.eventObs, controller.FailedScheduling, "failedScheduling event not received")
// add some matching offer
offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
lt.scheduler.ResourceOffers(nil, offers)
lt.framework.ResourceOffers(nil, offers)
// first offer is declined because node is not available yet
lt.apiServer.WaitForNode("some_hostname")
// add one more offer
lt.scheduler.ResourceOffers(nil, offers)
lt.framework.ResourceOffers(nil, offers)
// and wait for scheduled pod
assert.EventWithReason(lt.eventObs, Scheduled)
assert.EventWithReason(lt.eventObs, controller.Scheduled)
select {
case launchedTask := <-launchedTasks:
// report back that the task has been staged, and then started by mesos
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
)
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
)
@@ -660,7 +634,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
// report back that the task has been lost
lt.driver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_LOST),
)
@@ -677,22 +651,22 @@ func TestPlugin_LifeCycle(t *testing.T) {
// Launch a pod and wait until the scheduler driver is called
schedulePodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
// wait for failedScheduling event because there is no offer
assert.EventWithReason(lt.eventObs, FailedScheduling, "failedScheduling event not received")
assert.EventWithReason(lt.eventObs, controller.FailedScheduling, "failedScheduling event not received")
// supply a matching offer
lt.scheduler.ResourceOffers(lt.driver, offers)
lt.framework.ResourceOffers(lt.driver, offers)
for _, offer := range offers {
if _, ok := offeredNodes[offer.GetHostname()]; !ok {
offeredNodes[offer.GetHostname()] = struct{}{}
lt.apiServer.WaitForNode(offer.GetHostname())
// reoffer since it must have been declined above
lt.scheduler.ResourceOffers(lt.driver, []*mesos.Offer{offer})
lt.framework.ResourceOffers(lt.driver, []*mesos.Offer{offer})
}
}
// and wait to get scheduled
assert.EventWithReason(lt.eventObs, Scheduled)
assert.EventWithReason(lt.eventObs, controller.Scheduled)
// wait for driver.launchTasks call
select {
@@ -722,11 +696,11 @@ func TestPlugin_LifeCycle(t *testing.T) {
pod, launchedTask, offer := launchPodWithOffers(pod, offers)
if pod != nil {
// report back status
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
)
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
)
@@ -762,7 +736,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
select {
case <-killTaskCalled:
// report back that the task is finished
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_FINISHED),
)
@@ -787,8 +761,8 @@ func TestPlugin_LifeCycle(t *testing.T) {
assert.Equal(offers[1].Id.GetValue(), usedOffer.Id.GetValue())
assert.Equal(pod.Spec.NodeName, *usedOffer.Hostname)
lt.scheduler.OfferRescinded(lt.driver, offers[0].Id)
lt.scheduler.OfferRescinded(lt.driver, offers[2].Id)
lt.framework.OfferRescinded(lt.driver, offers[0].Id)
lt.framework.OfferRescinded(lt.driver, offers[2].Id)
// start pods:
// - which are failing while binding,
@@ -800,7 +774,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
message := messages.CreateBindingFailure
status.Message = &message
lt.scheduler.StatusUpdate(lt.driver, status)
lt.framework.StatusUpdate(lt.driver, status)
// wait until pod is looked up at the apiserver
assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
@@ -822,7 +796,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
podKey, _ := podtask.MakePodKey(api.NewDefaultContext(), pod.Name)
assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
t, _ := lt.plugin.api.tasks().ForPod(podKey)
t, _ := lt.sched.Tasks().ForPod(podKey)
return t == nil
})
@@ -845,143 +819,3 @@ func TestPlugin_LifeCycle(t *testing.T) {
time.Sleep(time.Second / 2)
failPodFromExecutor(launchedTask.taskInfo)
}
func TestDeleteOne_NonexistentPod(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
qr := newQueuer(nil)
assert.Equal(0, len(qr.podQueue.List()))
d := &deleter{
api: obj,
qr: qr,
}
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
Namespace: api.NamespaceDefault,
}}}
err := d.deleteOne(pod)
assert.Equal(err, noSuchPodErr)
obj.AssertExpectations(t)
}
func TestDeleteOne_PendingPod(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{})
if err != nil {
t.Fatalf("failed to create task: %v", err)
}
_, err = reg.Register(task)
if err != nil {
t.Fatalf("failed to register task: %v", err)
}
// preconditions
qr := newQueuer(nil)
qr.podQueue.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(qr.podQueue.List()))
_, found := qr.podQueue.Get("default/foo")
assert.True(found)
// exec & post conditions
d := &deleter{
api: obj,
qr: qr,
}
err = d.deleteOne(pod)
assert.Nil(err)
_, found = qr.podQueue.Get("foo0")
assert.False(found)
assert.Equal(0, len(qr.podQueue.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_Running(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task, err = reg.Register(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task.Set(podtask.Launched)
err = reg.Update(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// preconditions
qr := newQueuer(nil)
qr.podQueue.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(qr.podQueue.List()))
_, found := qr.podQueue.Get("default/foo")
assert.True(found)
obj.On("killTask", task.ID).Return(nil)
// exec & post conditions
d := &deleter{
api: obj,
qr: qr,
}
err = d.deleteOne(pod)
assert.Nil(err)
_, found = qr.podQueue.Get("foo0")
assert.False(found)
assert.Equal(0, len(qr.podQueue.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_badPodNaming(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
pod := &Pod{Pod: &api.Pod{}}
d := &deleter{
api: obj,
qr: newQueuer(nil),
}
err := d.deleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = "foo"
err = d.deleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = ""
pod.Pod.ObjectMeta.Namespace = "bar"
err = d.deleteOne(pod)
assert.NotNil(err)
obj.AssertExpectations(t)
}

View File

@@ -25,7 +25,6 @@ const (
TaskIdKey = "k8s.mesosphere.io/taskId"
SlaveIdKey = "k8s.mesosphere.io/slaveId"
OfferIdKey = "k8s.mesosphere.io/offerId"
ExecutorIdKey = "k8s.mesosphere.io/executorId"
PortMappingKeyPrefix = "k8s.mesosphere.io/port_"
PortMappingKeyFormat = PortMappingKeyPrefix + "%s_%d"
PortNameMappingKeyPrefix = "k8s.mesosphere.io/portName_"

View File

@@ -1,930 +0,0 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"io"
"net/http"
"strconv"
"sync"
"time"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
"k8s.io/kubernetes/pkg/client/cache"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/util"
plugin "k8s.io/kubernetes/plugin/pkg/scheduler"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
)
const (
enqueuePopTimeout = 200 * time.Millisecond
enqueueWaitTimeout = 1 * time.Second
yieldPopTimeout = 200 * time.Millisecond
yieldWaitTimeout = 1 * time.Second
pluginRecoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling
)
const (
FailedScheduling = "FailedScheduling"
Scheduled = "Scheduled"
)
// scheduler abstraction to allow for easier unit testing
type schedulerInterface interface {
sync.Locker // synchronize scheduler plugin operations
SlaveIndex
algorithm() PodScheduler
offers() offers.Registry
tasks() podtask.Registry
// driver calls
killTask(taskId string) error
launchTask(*podtask.T) error
// convenience
createPodTask(api.Context, *api.Pod) (*podtask.T, error)
}
type k8smScheduler struct {
sync.Mutex
internal *KubernetesScheduler
}
func (k *k8smScheduler) algorithm() PodScheduler {
return k.internal
}
func (k *k8smScheduler) offers() offers.Registry {
return k.internal.offers
}
func (k *k8smScheduler) tasks() podtask.Registry {
return k.internal.taskRegistry
}
func (k *k8smScheduler) createPodTask(ctx api.Context, pod *api.Pod) (*podtask.T, error) {
return podtask.New(ctx, "", *pod, k.internal.executor)
}
func (k *k8smScheduler) slaveHostNameFor(id string) string {
return k.internal.slaveHostNames.HostName(id)
}
func (k *k8smScheduler) killTask(taskId string) error {
killTaskId := mutil.NewTaskID(taskId)
_, err := k.internal.driver.KillTask(killTaskId)
return err
}
func (k *k8smScheduler) launchTask(task *podtask.T) error {
// assume caller is holding scheduler lock
taskList := []*mesos.TaskInfo{task.BuildTaskInfo()}
offerIds := []*mesos.OfferID{task.Offer.Details().Id}
filters := &mesos.Filters{}
_, err := k.internal.driver.LaunchTasks(offerIds, taskList, filters)
return err
}
type binder struct {
api schedulerInterface
}
// implements binding.Registry, launches the pod-associated-task in mesos
func (b *binder) Bind(binding *api.Binding) error {
ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
// default upstream scheduler passes pod.Name as binding.Name
podKey, err := podtask.MakePodKey(ctx, binding.Name)
if err != nil {
return err
}
b.api.Lock()
defer b.api.Unlock()
switch task, state := b.api.tasks().ForPod(podKey); state {
case podtask.StatePending:
return b.bind(ctx, binding, task)
default:
// in this case it's likely that the pod has been deleted between Schedule
// and Bind calls
log.Infof("No pending task for pod %s", podKey)
return noSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
}
}
func (b *binder) rollback(task *podtask.T, err error) error {
task.Offer.Release()
task.Reset()
if err2 := b.api.tasks().Update(task); err2 != nil {
log.Errorf("failed to update pod task: %v", err2)
}
return err
}
// assumes that the caller has acquired the scheduler lock and that the task is still pending
//
// bind does not actually do the binding itself, but launches the pod as a Mesos task. The
// kubernetes executor on the slave will finally do the binding. This is different from the
// upstream scheduler in the sense that the upstream scheduler does the binding and the
// kubelet will notice that and launch the pod.
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
// sanity check: ensure that the task hasAcceptedOffer(); it's possible that between
// Schedule() and now the offer for this task was rescinded or invalidated.
// ((we should never see this here))
if !task.HasAcceptedOffer() {
return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
}
// By this time, there is a chance that the slave is disconnected.
offerId := task.GetOfferId()
if offer, ok := b.api.offers().Get(offerId); !ok || offer.HasExpired() {
// already rescinded or timed out or otherwise invalidated
return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
}
if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\", cpu %.2f, mem %.2f MB",
task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name, task.Spec.CPU, task.Spec.Memory)
if err = b.api.launchTask(task); err == nil {
b.api.offers().Invalidate(offerId)
task.Set(podtask.Launched)
if err = b.api.tasks().Update(task); err != nil {
// this should only happen if the task has been removed or has changed status,
// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
log.Errorf("failed to update task w/ Launched status: %v", err)
}
return
}
}
return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err))
}
//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
pod := task.Pod
// we make an effort here to avoid making changes to the task's copy of the pod, since
// we want that to reflect the initial user spec, and not the modified spec that we
// build for the executor to consume.
oemCt := pod.Spec.Containers
pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
}
task.SaveRecoveryInfo(pod.Annotations)
pod.Annotations[annotation.BindingHostKey] = task.Spec.AssignedSlave
for _, entry := range task.Spec.PortMap {
oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
ports := append([]api.ContainerPort{}, oemPorts...)
p := &ports[entry.PortIdx]
p.HostPort = int(entry.OfferPort)
op := strconv.FormatUint(entry.OfferPort, 10)
pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
if p.Name != "" {
pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
}
pod.Spec.Containers[entry.ContainerIdx].Ports = ports
}
// the kubelet-executor uses this to instantiate the pod
log.V(3).Infof("prepared pod spec: %+v", pod)
data, err := api.Codec.Encode(&pod)
if err != nil {
log.V(2).Infof("Failed to marshal the pod spec: %v", err)
return err
}
task.Spec.Data = data
return nil
}
type kubeScheduler struct {
api schedulerInterface
podUpdates queue.FIFO
}
// recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
// the BindingHostKey. For tasks in the registry of the scheduler, the same
// value is stored in T.Spec.AssignedSlave. Before launching, the BindingHostKey
// annotation is added and the executor will eventually persist that to the
// apiserver on binding.
func recoverAssignedSlave(pod *api.Pod) string {
return pod.Annotations[annotation.BindingHostKey]
}
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selectedMachine's name and an error, if any.
func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.NodeLister) (string, error) {
log.Infof("Try to schedule pod %v\n", pod.Name)
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
// default upstream scheduler passes pod.Name as binding.PodID
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return "", err
}
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// There's a bit of a potential race here, a pod could have been yielded() and
// then before we get *here* it could be deleted.
// We use meta to index the pod in the store since that's what k8s reflector does.
podName, err := cache.MetaNamespaceKeyFunc(pod)
if err != nil {
log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
return "", noSuchPodErr
}
if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
log.Infof("aborting Schedule, pod has been deleted %+v", pod)
return "", noSuchPodErr
}
task, err := k.api.createPodTask(ctx, pod)
if err != nil {
return "", err
}
task, err = k.api.tasks().Register(task)
if err != nil {
return "", err
}
return k.doSchedule(task)
//TODO(jdef) it's possible that the pod state has diverged from what
//we knew previously, we should probably update the task.Pod state here
//before proceeding with scheduling
case podtask.StatePending:
if pod.UID != task.Pod.UID {
// we're dealing with a brand new pod spec here, so the old one must have been
// deleted -- and so our task store is out of sync w/ respect to reality
//TODO(jdef) reconcile task
return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
} else if task.Has(podtask.Launched) {
// task has been marked as "launched" but the pod binding creation may have failed in k8s,
// but we're going to let someone else handle it, probably the mesos task error handler
return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
} else {
return k.doSchedule(task)
}
default:
return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
}
}
// doSchedule schedules the given task and returns the machine the task is scheduled on
// or an error if the scheduling failed.
func (k *kubeScheduler) doSchedule(task *podtask.T) (string, error) {
var offer offers.Perishable
var err error
if task.HasAcceptedOffer() {
// verify that the offer is still on the table
var ok bool
offer, ok = k.api.offers().Get(task.GetOfferId())
if !ok || offer.HasExpired() {
task.Offer.Release()
task.Reset()
if err = k.api.tasks().Update(task); err != nil {
return "", err
}
}
}
if offer == nil {
offer, err = k.api.algorithm().SchedulePod(k.api.offers(), k.api, task)
}
if err != nil {
return "", err
}
details := offer.Details()
if details == nil {
return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
}
slaveId := details.GetSlaveId().GetValue()
slaveHostName := k.api.slaveHostNameFor(slaveId)
if slaveHostName == "" {
// not much sense in Release()ing the offer here since its owner died
offer.Release()
k.api.offers().Invalidate(details.Id.GetValue())
return "", fmt.Errorf("Slave disappeared (%v) while scheduling task %v", slaveId, task.ID)
}
if task.Offer != nil && task.Offer != offer {
return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
}
task.Offer = offer
if err := k.api.algorithm().Procurement()(task, details); err != nil {
offer.Release()
task.Reset()
return "", err
}
if err := k.api.tasks().Update(task); err != nil {
offer.Release()
return "", err
}
return slaveHostName, nil
}
type queuer struct {
lock sync.Mutex // shared by condition variables of this struct
podUpdates queue.FIFO // queue of pod updates to be processed
podQueue *queue.DelayFIFO // queue of pods to be scheduled
deltaCond sync.Cond // pod changes are available for processing
unscheduledCond sync.Cond // there are unscheduled pods for processing
}
func newQueuer(store queue.FIFO) *queuer {
q := &queuer{
podQueue: queue.NewDelayFIFO(),
podUpdates: store,
}
q.deltaCond.L = &q.lock
q.unscheduledCond.L = &q.lock
return q
}
func (q *queuer) installDebugHandlers(mux *http.ServeMux) {
mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.podQueue.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.podUpdates.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
}
// signal that there are probably pod updates waiting to be processed
func (q *queuer) updatesAvailable() {
q.deltaCond.Broadcast()
}
// delete a pod from the to-be-scheduled queue
func (q *queuer) dequeue(id string) {
q.podQueue.Delete(id)
}
// re-add a pod to the to-be-scheduled queue, will not overwrite existing pod data (that
// may have already changed).
func (q *queuer) requeue(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
q.podQueue.Add(pod, queue.KeepExisting)
q.unscheduledCond.Broadcast()
}
// same as requeue but calls podQueue.Offer instead of podQueue.Add
func (q *queuer) reoffer(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
if q.podQueue.Offer(pod, queue.KeepExisting) {
q.unscheduledCond.Broadcast()
}
}
// spawns a go-routine to watch for unscheduled pods and queue them up
// for scheduling. returns immediately.
func (q *queuer) Run(done <-chan struct{}) {
go runtime.Until(func() {
log.Info("Watching for newly created pods")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here for short intervals so that scheduling
// may proceed even if there have been no recent pod changes
p := q.podUpdates.Await(enqueuePopTimeout)
if p == nil {
signalled := runtime.After(q.deltaCond.Wait)
// we've yielded the lock
select {
case <-time.After(enqueueWaitTimeout):
q.deltaCond.Broadcast() // abort Wait()
<-signalled // wait for lock re-acquisition
log.V(4).Infoln("timed out waiting for a pod update")
case <-signalled:
// we've acquired the lock and there may be
// changes for us to process now
}
continue
}
pod := p.(*Pod)
if recoverAssignedSlave(pod.Pod) != "" {
log.V(3).Infof("dequeuing assigned pod for scheduling: %v", pod.Pod.Name)
q.dequeue(pod.GetUID())
} else {
// use ReplaceExisting because we are always pushing the latest state
now := time.Now()
pod.deadline = &now
if q.podQueue.Offer(pod, queue.ReplaceExisting) {
q.unscheduledCond.Broadcast()
log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
} else {
log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
}
}
}
}, 1*time.Second, done)
}
// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
func (q *queuer) yield() *api.Pod {
log.V(2).Info("attempting to yield a pod")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here to short intervals so that we don't block the
// enqueuer Run() routine for very long
kpod := q.podQueue.Await(yieldPopTimeout)
if kpod == nil {
signalled := runtime.After(q.unscheduledCond.Wait)
// lock is yielded at this point and we're going to wait for either
// a timeout, or a signal that there's data
select {
case <-time.After(yieldWaitTimeout):
q.unscheduledCond.Broadcast() // abort Wait()
<-signalled // wait for the go-routine, and the lock
log.V(4).Infoln("timed out waiting for a pod to yield")
case <-signalled:
// we have acquired the lock, and there
// may be a pod for us to pop now
}
continue
}
pod := kpod.(*Pod).Pod
if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
} else if recoverAssignedSlave(pod) != "" {
// should never happen if enqueuePods is filtering properly
log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
} else {
return pod
}
}
}
type errorHandler struct {
api schedulerInterface
backoff *backoff.Backoff
qr *queuer
}
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {
if schedulingErr == noSuchPodErr {
log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
return
}
log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
defer util.HandleCrash()
// default upstream scheduler passes pod.Name as binding.PodID
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
return
}
k.backoff.GC()
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// if we don't have a mapping here any more then someone deleted the pod
log.V(2).Infof("Could not resolve pod to task, aborting pod reschdule: %s", podKey)
return
case podtask.StatePending:
if task.Has(podtask.Launched) {
log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
return
}
breakoutEarly := queue.BreakChan(nil)
if schedulingErr == noSuitableOffersErr {
log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
breakoutEarly = queue.BreakChan(k.api.offers().Listen(podKey, func(offer *mesos.Offer) bool {
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().Get(task.ID); state {
case podtask.StatePending:
// Assess fitness of pod with the current offer. The scheduler normally
// "backs off" when it can't find an offer that matches up with a pod.
// The backoff period for a pod can terminate sooner if an offer becomes
// available that matches up.
return !task.Has(podtask.Launched) && k.api.algorithm().FitPredicate()(task, offer, nil)
default:
// no point in continuing to check for matching offers
return true
}
}))
}
delay := k.backoff.Get(podKey)
log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
k.qr.requeue(&Pod{Pod: pod, delay: &delay, notify: breakoutEarly})
default:
log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
}
}
type deleter struct {
api schedulerInterface
qr *queuer
}
// currently monitors for "pod deleted" events, upon which deleteOne()
// is invoked.
func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
go runtime.Until(func() {
for {
entry := <-updates
pod := entry.Value().(*Pod)
if entry.Is(queue.DELETE_EVENT) {
if err := k.deleteOne(pod); err != nil {
log.Error(err)
}
} else if !entry.Is(queue.POP_EVENT) {
k.qr.updatesAvailable()
}
}
}, 1*time.Second, done)
}
func (k *deleter) deleteOne(pod *Pod) error {
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return err
}
log.V(2).Infof("pod deleted: %v", podKey)
// order is important here: we want to make sure we have the lock before
// removing the pod from the scheduling queue. this makes the concurrent
// execution of scheduler-error-handling and delete-handling easier to
// reason about.
k.api.Lock()
defer k.api.Unlock()
// prevent the scheduler from attempting to pop this; it's also possible that
// it's concurrently being scheduled (somewhere between pod scheduling and
// binding) - if so, then we'll end up removing it from taskRegistry which
// will abort Bind()ing
k.qr.dequeue(pod.GetUID())
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
return noSuchPodErr
// determine if the task has already been launched to mesos, if not then
// cleanup is easier (unregister) since there's no state to sync
case podtask.StatePending:
if !task.Has(podtask.Launched) {
// we've been invoked in between Schedule() and Bind()
if task.HasAcceptedOffer() {
task.Offer.Release()
task.Reset()
task.Set(podtask.Deleted)
//TODO(jdef) probably want better handling here
if err := k.api.tasks().Update(task); err != nil {
return err
}
}
k.api.tasks().Unregister(task)
return nil
}
fallthrough
case podtask.StateRunning:
// signal to watchers that the related pod is going down
task.Set(podtask.Deleted)
if err := k.api.tasks().Update(task); err != nil {
log.Errorf("failed to update task w/ Deleted status: %v", err)
}
return k.api.killTask(task.ID)
default:
log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
return noSuchTaskErr
}
}
// Create creates a scheduler plugin and all supporting background functions.
func (k *KubernetesScheduler) NewDefaultPluginConfig(terminate <-chan struct{}, mux *http.ServeMux) *PluginConfig {
// use ListWatch watching pods using the client by default
return k.NewPluginConfig(terminate, mux, createAllPodsLW(k.client))
}
func (k *KubernetesScheduler) NewPluginConfig(terminate <-chan struct{}, mux *http.ServeMux,
podsWatcher *cache.ListWatch) *PluginConfig {
// Watch and queue pods that need scheduling.
updates := make(chan queue.Entry, k.schedcfg.UpdatesBacklog)
podUpdates := &podStoreAdapter{queue.NewHistorical(updates)}
reflector := cache.NewReflector(podsWatcher, &api.Pod{}, podUpdates, 0)
// lock that guards critical sections that involve transferring pods from
// the store (cache) to the scheduling queue; its purpose is to maintain
// an ordering (vs interleaving) of operations that's easier to reason about.
kapi := &k8smScheduler{internal: k}
q := newQueuer(podUpdates)
podDeleter := &deleter{
api: kapi,
qr: q,
}
eh := &errorHandler{
api: kapi,
backoff: backoff.New(k.schedcfg.InitialPodBackoff.Duration, k.schedcfg.MaxPodBackoff.Duration),
qr: q,
}
startLatch := make(chan struct{})
eventBroadcaster := record.NewBroadcaster()
runtime.On(startLatch, func() {
eventBroadcaster.StartRecordingToSink(k.client.Events(""))
reflector.Run() // TODO(jdef) should listen for termination
podDeleter.Run(updates, terminate)
q.Run(terminate)
q.installDebugHandlers(mux)
podtask.InstallDebugHandlers(k.taskRegistry, mux)
})
return &PluginConfig{
Config: &plugin.Config{
NodeLister: nil,
Algorithm: &kubeScheduler{
api: kapi,
podUpdates: podUpdates,
},
Binder: &binder{api: kapi},
NextPod: q.yield,
Error: eh.handleSchedulingError,
Recorder: eventBroadcaster.NewRecorder(api.EventSource{Component: "scheduler"}),
},
api: kapi,
client: k.client,
qr: q,
deleter: podDeleter,
starting: startLatch,
}
}
type PluginConfig struct {
*plugin.Config
api schedulerInterface
client *client.Client
qr *queuer
deleter *deleter
starting chan struct{} // startup latch
}
func NewPlugin(c *PluginConfig) PluginInterface {
return &schedulingPlugin{
config: c.Config,
api: c.api,
client: c.client,
qr: c.qr,
deleter: c.deleter,
starting: c.starting,
}
}
type schedulingPlugin struct {
config *plugin.Config
api schedulerInterface
client *client.Client
qr *queuer
deleter *deleter
starting chan struct{}
}
func (s *schedulingPlugin) Run(done <-chan struct{}) {
defer close(s.starting)
go runtime.Until(s.scheduleOne, pluginRecoveryDelay, done)
}
// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go,
// with the Modeler stuff removed since we don't use it because we have mesos.
func (s *schedulingPlugin) scheduleOne() {
pod := s.config.NextPod()
// pods which are pre-scheduled (i.e. NodeName is set) are deleted by the kubelet
// in upstream. Not so in Mesos because the kubelet hasn't seen that pod yet. Hence,
// the scheduler has to take care of this:
if pod.Spec.NodeName != "" && pod.DeletionTimestamp != nil {
log.V(3).Infof("deleting pre-scheduled, not yet running pod: %s/%s", pod.Namespace, pod.Name)
s.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0))
return
}
log.V(3).Infof("Attempting to schedule: %+v", pod)
dest, err := s.config.Algorithm.Schedule(pod, s.config.NodeLister) // call kubeScheduler.Schedule
if err != nil {
log.V(1).Infof("Failed to schedule: %+v", pod)
s.config.Recorder.Eventf(pod, FailedScheduling, "Error scheduling: %v", err)
s.config.Error(pod, err)
return
}
b := &api.Binding{
ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
Target: api.ObjectReference{
Kind: "Node",
Name: dest,
},
}
if err := s.config.Binder.Bind(b); err != nil {
log.V(1).Infof("Failed to bind pod: %+v", err)
s.config.Recorder.Eventf(pod, FailedScheduling, "Binding rejected: %v", err)
s.config.Error(pod, err)
return
}
s.config.Recorder.Eventf(pod, Scheduled, "Successfully assigned %v to %v", pod.Name, dest)
}
// this pod may be out of sync with respect to the API server registry:
// this pod | apiserver registry
// -------------|----------------------
// host=.* | 404 ; pod was deleted
// host=.* | 5xx ; failed to sync, try again later?
// host="" | host="" ; perhaps no updates to process?
// host="" | host="..." ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
// host="..." | host="" ; pod is no longer scheduled, does it need to be re-queued?
// host="..." | host="..." ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *schedulingPlugin) reconcileTask(t *podtask.T) {
log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
if err != nil {
if errors.IsNotFound(err) {
// attempt to delete
if err = s.deleter.deleteOne(&Pod{Pod: &t.Pod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
}
} else {
//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
//For now, drop the pod on the floor
log.Warning("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
}
return
}
log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
if t.Spec.AssignedSlave != pod.Spec.NodeName {
if pod.Spec.NodeName == "" {
// pod is unscheduled.
// it's possible that we dropped the pod in the scheduler error handler
// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Error(err)
return
}
s.api.Lock()
defer s.api.Unlock()
if _, state := s.api.tasks().ForPod(podKey); state != podtask.StateUnknown {
//TODO(jdef) reconcile the task
log.Errorf("task already registered for pod %v", pod.Name)
return
}
now := time.Now()
log.V(3).Infof("reoffering pod %v", podKey)
s.qr.reoffer(&Pod{
Pod: pod,
deadline: &now,
})
} else {
// pod is scheduled.
// not sure how this happened behind our backs. attempt to reconstruct
// at least a partial podtask.T record.
//TODO(jdef) reconcile the task
log.Errorf("pod already scheduled: %v", pod.Name)
}
} else {
//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
//and assume that our knowledge of the pod aligns with that of the apiserver
log.Error("pod reconciliation does not support updates; not yet implemented")
}
}
func parseSelectorOrDie(s string) fields.Selector {
selector, err := fields.ParseSelector(s)
if err != nil {
panic(err)
}
return selector
}
// createAllPodsLW returns a listWatch that finds all pods
func createAllPodsLW(cl *client.Client) *cache.ListWatch {
return cache.NewListWatchFromClient(cl, "pods", api.NamespaceAll, parseSelectorOrDie(""))
}
// Consumes *api.Pod, produces *Pod; the k8s reflector wants to push *api.Pod
// objects at us, but we want to store the more flexible Pod type defined in
// this package. The adapter implementation facilitates this. It's a little
// hackish since the object type going in is different than the object type
// coming out -- you've been warned.
type podStoreAdapter struct {
queue.FIFO
}
func (psa *podStoreAdapter) Add(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Add(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Update(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Update(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Delete(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Delete(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
pod := obj.(*api.Pod)
return psa.FIFO.Get(&Pod{Pod: pod})
}
// Replace will delete the contents of the store, using instead the
// given map. This store implementation does NOT take ownership of the map.
func (psa *podStoreAdapter) Replace(objs []interface{}, resourceVersion string) error {
newobjs := make([]interface{}, len(objs))
for i, v := range objs {
pod := v.(*api.Pod)
newobjs[i] = &Pod{Pod: pod}
}
return psa.FIFO.Replace(newobjs, resourceVersion)
}

View File

@@ -18,6 +18,7 @@ package podtask
import (
"fmt"
"strings"
"time"
"github.com/gogo/protobuf/proto"
@@ -62,7 +63,6 @@ type T struct {
UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master
podStatus api.PodStatus
executor *mesos.ExecutorInfo // readonly
podKey string
launchTime time.Time
bindTime time.Time
@@ -130,21 +130,49 @@ func generateTaskName(pod *api.Pod) string {
return fmt.Sprintf("%s.%s.pods", pod.Name, ns)
}
func (t *T) BuildTaskInfo() *mesos.TaskInfo {
func setCommandArgument(ei *mesos.ExecutorInfo, flag, value string, create bool) {
argv := []string{}
overwrite := false
if ei.Command != nil && ei.Command.Arguments != nil {
argv = ei.Command.Arguments
for i, arg := range argv {
if strings.HasPrefix(arg, flag+"=") {
overwrite = true
argv[i] = flag + "=" + value
break
}
}
}
if !overwrite && create {
argv = append(argv, flag+"="+value)
if ei.Command == nil {
ei.Command = &mesos.CommandInfo{}
}
ei.Command.Arguments = argv
}
}
func (t *T) BuildTaskInfo(prototype *mesos.ExecutorInfo) *mesos.TaskInfo {
info := &mesos.TaskInfo{
Name: proto.String(generateTaskName(&t.Pod)),
TaskId: mutil.NewTaskID(t.ID),
SlaveId: mutil.NewSlaveID(t.Spec.SlaveID),
Executor: t.executor,
Executor: proto.Clone(prototype).(*mesos.ExecutorInfo),
Data: t.Spec.Data,
Resources: []*mesos.Resource{
mutil.NewScalarResource("cpus", float64(t.Spec.CPU)),
mutil.NewScalarResource("mem", float64(t.Spec.Memory)),
},
}
if portsResource := rangeResource("ports", t.Spec.Ports); portsResource != nil {
info.Resources = append(info.Resources, portsResource)
}
// the hostname of the executor needs to match that of the offer, otherwise
// the kubelet node status checker/updater is very unhappy
setCommandArgument(info.Executor, "--hostname-override", t.Spec.AssignedSlave, true)
return info
}
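
With this change a task no longer carries its own ExecutorInfo: the caller passes a prototype, BuildTaskInfo clones it per task, and --hostname-override is stamped into the clone at build time. A sketch of the launch path under that contract, reusing the LaunchTasks shape from the old launchTask elsewhere in this diff (driver and prototype are illustrative):

info := task.BuildTaskInfo(prototype) // clones prototype; the shared ExecutorInfo is never mutated
_, err := driver.LaunchTasks(
	[]*mesos.OfferID{task.Offer.Details().Id},
	[]*mesos.TaskInfo{info},
	&mesos.Filters{},
)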
@@ -170,10 +198,7 @@ func (t *T) Has(f FlagType) (exists bool) {
return
}
func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo) (*T, error) {
if executor == nil {
return nil, fmt.Errorf("illegal argument: executor was nil")
}
func New(ctx api.Context, id string, pod *api.Pod) (*T, error) {
key, err := MakePodKey(ctx, pod.Name)
if err != nil {
return nil, err
@@ -182,13 +207,12 @@ func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo)
id = "pod." + uuid.NewUUID().String()
}
task := &T{
ID: id,
Pod: pod,
State: StatePending,
podKey: key,
mapper: MappingTypeForPod(&pod),
Flags: make(map[FlagType]struct{}),
executor: proto.Clone(executor).(*mesos.ExecutorInfo),
ID: id,
Pod: *pod,
State: StatePending,
podKey: key,
mapper: MappingTypeForPod(pod),
Flags: make(map[FlagType]struct{}),
}
task.CreateTime = time.Now()
return task, nil
@@ -198,7 +222,6 @@ func (t *T) SaveRecoveryInfo(dict map[string]string) {
dict[annotation.TaskIdKey] = t.ID
dict[annotation.SlaveIdKey] = t.Spec.SlaveID
dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue()
dict[annotation.ExecutorIdKey] = t.executor.ExecutorId.GetValue()
}
// reconstruct a task from metadata stashed in a pod entry. there are limited pod states that
@@ -256,7 +279,6 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
annotation.TaskIdKey,
annotation.SlaveIdKey,
annotation.OfferIdKey,
annotation.ExecutorIdKey,
} {
v, found := pod.Annotations[k]
if !found {
@@ -271,10 +293,6 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
offerId = v
case annotation.TaskIdKey:
t.ID = v
case annotation.ExecutorIdKey:
// this is nowhere near sufficient to re-launch a task, but we really just
// want this for tracking
t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
}
}
t.Offer = offers.Expired(offerId, t.Spec.AssignedSlave, 0)
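
The annotations written by SaveRecoveryInfo are exactly what RecoverFrom reads back after a failover; with ExecutorIdKey dropped, the round trip now covers task, slave, and offer IDs only. A sketch of that round trip (the pod value is illustrative):

// before launch: stash recovery info in the pod's annotations
pod.Annotations = map[string]string{}
task.SaveRecoveryInfo(pod.Annotations)

// after a scheduler restart: rebuild a partial podtask.T from the pod
recovered, ok, err := RecoverFrom(pod)
if err != nil || !ok {
	// pod was not in a recoverable state; fall back to normal scheduling
}
_ = recovered // e.g. re-register with the task registry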

View File

@@ -35,12 +35,12 @@ const (
)
func fakePodTask(id string) (*T, error) {
return New(api.NewDefaultContext(), "", api.Pod{
return New(api.NewDefaultContext(), "", &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: id,
Namespace: api.NamespaceDefault,
},
}, &mesos.ExecutorInfo{})
})
}
func TestUnlimitedResources(t *testing.T) {

View File

@@ -52,7 +52,7 @@ func TestDefaultHostPortMatching(t *testing.T) {
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
task, err = New(api.NewDefaultContext(), "", pod)
if err != nil {
t.Fatal(err)
}
@@ -100,7 +100,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
task, err = New(api.NewDefaultContext(), "", pod)
if err != nil {
t.Fatal(err)
}
@@ -123,7 +123,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
task, err = New(api.NewDefaultContext(), "", pod)
if err != nil {
t.Fatal(err)
}
@@ -144,7 +144,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
task, err = New(api.NewDefaultContext(), "", pod)
if err != nil {
t.Fatal(err)
}

View File

@@ -17,8 +17,6 @@ limitations under the License.
package podtask
import (
"strings"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
@@ -74,31 +72,11 @@ func ValidateProcurement(t *T, offer *mesos.Offer) error {
return nil
}
func setCommandArgument(ei *mesos.ExecutorInfo, flag, value string, create bool) {
argv := ei.Command.Arguments
overwrite := false
for i, arg := range argv {
if strings.HasPrefix(arg, flag+"=") {
overwrite = true
argv[i] = flag + "=" + value
break
}
}
if !overwrite && create {
ei.Command.Arguments = append(argv, flag+"="+value)
}
}
// NodeProcurement updates t.Spec in preparation for the task to be launched on the
// slave associated with the offer.
func NodeProcurement(t *T, offer *mesos.Offer) error {
t.Spec.SlaveID = offer.GetSlaveId().GetValue()
t.Spec.AssignedSlave = offer.GetHostname()
// the hostname of the executor needs to match that of the offer, otherwise
// the kubelet node status checker/updater is very unhappy
setCommandArgument(t.executor, "--hostname-override", offer.GetHostname(), true)
return nil
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package queuer implements a pod Queuer which stores and yields pods waiting
// to be scheduled.
package queuer
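
A minimal wiring sketch for this package, matching the New signature and the Queuer interface defined later in this diff (the update-channel backlog size is illustrative):

updates := make(chan queue.Entry, 100)     // backlog size is illustrative
podUpdates := queue.NewHistorical(updates) // FIFO fed by a pod reflector elsewhere
q := queuer.New(queue.NewDelayFIFO(), podUpdates)

done := make(chan struct{})
q.Run(done)      // spawns the watcher that moves unscheduled pods onto the delay queue
pod := q.Yield() // blocks until a pod is ready to be scheduled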

View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package queuer
import (
"fmt"
@@ -29,8 +29,12 @@ import (
type Pod struct {
*api.Pod
deadline *time.Time
delay *time.Duration
notify queue.BreakChan
Delay *time.Duration
Notify queue.BreakChan
}
func NewPodWithDeadline(pod *api.Pod, deadline *time.Time) *Pod {
return &Pod{Pod: pod, deadline: deadline}
}
// implements Copyable
@@ -54,21 +58,21 @@ func (p *Pod) GetUID() string {
// implements Deadlined
func (dp *Pod) Deadline() (time.Time, bool) {
if dp.deadline != nil {
return *(dp.deadline), true
}
return time.Time{}, false
}
func (dp *Pod) GetDelay() time.Duration {
if dp.delay != nil {
return *(dp.delay)
if dp.Delay != nil {
return *(dp.Delay)
}
return 0
}
func (p *Pod) Breaker() queue.BreakChan {
return p.notify
return p.Notify
}
func (p *Pod) String() string {

View File

@@ -0,0 +1,209 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queuer
import (
"fmt"
"io"
"net/http"
"sync"
"time"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
)
const (
enqueuePopTimeout = 200 * time.Millisecond
enqueueWaitTimeout = 1 * time.Second
yieldPopTimeout = 200 * time.Millisecond
yieldWaitTimeout = 1 * time.Second
)
type Queuer interface {
InstallDebugHandlers(mux *http.ServeMux)
UpdatesAvailable()
Dequeue(id string)
Requeue(pod *Pod)
Reoffer(pod *Pod)
Yield() *api.Pod
Run(done <-chan struct{})
}
type queuer struct {
lock sync.Mutex // shared by condition variables of this struct
updates queue.FIFO // queue of pod updates to be processed
queue *queue.DelayFIFO // queue of pods to be scheduled
deltaCond sync.Cond // pod changes are available for processing
unscheduledCond sync.Cond // there are unscheduled pods for processing
}
func New(queue *queue.DelayFIFO, updates queue.FIFO) Queuer {
q := &queuer{
queue: queue,
updates: updates,
}
q.deltaCond.L = &q.lock
q.unscheduledCond.L = &q.lock
return q
}
func (q *queuer) InstallDebugHandlers(mux *http.ServeMux) {
mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.queue.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.updates.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
}
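A short usage sketch for the endpoints registered above (address illustrative):

mux := http.NewServeMux()
q.InstallDebugHandlers(mux)
// then: curl http://127.0.0.1:10251/debug/scheduler/podqueue
go http.ListenAndServe("127.0.0.1:10251", mux)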
// signal that there are probably pod updates waiting to be processed
func (q *queuer) UpdatesAvailable() {
q.deltaCond.Broadcast()
}
// delete a pod from the to-be-scheduled queue
func (q *queuer) Dequeue(id string) {
q.queue.Delete(id)
}
// re-add a pod to the to-be-scheduled queue, will not overwrite existing pod data (that
// may have already changed).
func (q *queuer) Requeue(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
q.queue.Add(pod, queue.KeepExisting)
q.unscheduledCond.Broadcast()
}
// same as Requeue but calls podQueue.Offer instead of podQueue.Add
func (q *queuer) Reoffer(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
if q.queue.Offer(pod, queue.KeepExisting) {
q.unscheduledCond.Broadcast()
}
}
// spawns a go-routine to watch for unscheduled pods and queue them up
// for scheduling. returns immediately.
func (q *queuer) Run(done <-chan struct{}) {
go runtime.Until(func() {
log.Info("Watching for newly created pods")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here for short intervals so that scheduling
// may proceed even if there have been no recent pod changes
p := q.updates.Await(enqueuePopTimeout)
if p == nil {
signalled := runtime.After(q.deltaCond.Wait)
// we've yielded the lock
select {
case <-time.After(enqueueWaitTimeout):
q.deltaCond.Broadcast() // abort Wait()
<-signalled // wait for lock re-acquisition
log.V(4).Infoln("timed out waiting for a pod update")
case <-signalled:
// we've acquired the lock and there may be
// changes for us to process now
}
continue
}
pod := p.(*Pod)
if recoverAssignedSlave(pod.Pod) != "" {
log.V(3).Infof("dequeuing assigned pod for scheduling: %v", pod.Pod.Name)
q.Dequeue(pod.GetUID())
} else {
// use ReplaceExisting because we are always pushing the latest state
now := time.Now()
pod.deadline = &now
if q.queue.Offer(pod, queue.ReplaceExisting) {
q.unscheduledCond.Broadcast()
log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
} else {
log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
}
}
}
}, 1*time.Second, done)
}
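The loop above bounds sync.Cond.Wait with a timeout by running the Wait inside a helper goroutine. A self-contained sketch of the pattern, assuming runtime.After behaves like the after helper below (run a func, close the returned channel when it finishes):

package main

import (
	"sync"
	"time"
)

// after runs f and closes the returned channel once f returns.
func after(f func()) <-chan struct{} {
	ch := make(chan struct{})
	go func() { defer close(ch); f() }()
	return ch
}

// waitOrTimeout bounds cond.Wait by d; the caller must hold cond.L.
func waitOrTimeout(cond *sync.Cond, d time.Duration) {
	signalled := after(cond.Wait) // Wait releases the lock while blocked
	select {
	case <-time.After(d):
		cond.Broadcast() // abort the Wait
		<-signalled      // Wait has returned, holding the lock again
	case <-signalled:
		// woken normally; the lock is held again
	}
}

func main() {
	var mu sync.Mutex
	cond := sync.NewCond(&mu)
	mu.Lock()
	waitOrTimeout(cond, 100*time.Millisecond) // no one signals: times out
	mu.Unlock()
}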
// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
func (q *queuer) Yield() *api.Pod {
log.V(2).Info("attempting to yield a pod")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here to short intervals so that we don't block the
// enqueuer Run() routine for very long
kpod := q.queue.Await(yieldPopTimeout)
if kpod == nil {
signalled := runtime.After(q.unscheduledCond.Wait)
// lock is yielded at this point and we're going to wait for either
// a timeout, or a signal that there's data
select {
case <-time.After(yieldWaitTimeout):
q.unscheduledCond.Broadcast() // abort Wait()
<-signalled // wait for the go-routine, and the lock
log.V(4).Infoln("timed out waiting for a pod to yield")
case <-signalled:
// we have acquired the lock, and there
// may be a pod for us to pop now
}
continue
}
pod := kpod.(*Pod).Pod
if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
} else if !q.updates.Poll(podName, queue.POP_EVENT) {
log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
} else if recoverAssignedSlave(pod) != "" {
// should never happen if the enqueuer's Run loop is filtering properly
log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
} else {
return pod
}
}
}
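Yield is shaped to serve as the upstream plugin's NextPod callback. A hedged wiring sketch (the Config type is assumed from k8s plugin/pkg/scheduler, imported here under an illustrative alias; other fields elided):

cfg := &plugin.Config{
	NextPod: func() *api.Pod { return q.Yield() },
	// Algorithm, Binder, Error handler etc. elided
}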
// recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
// the BindingHostKey. For tasks in the registry of the scheduler, the same
// value is stored in T.Spec.AssignedSlave. Before launching, the BindingHostKey
// annotation is added and the executor will eventually persist that to the
// apiserver on binding.
func recoverAssignedSlave(pod *api.Pod) string {
return pod.Annotations[annotation.BindingHostKey]
}
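For example (hostname illustrative):

pod := &api.Pod{}
pod.Annotations = map[string]string{annotation.BindingHostKey: "slave-7"}
host := recoverAssignedSlave(pod) // "slave-7": the pod is treated as already scheduled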

View File

@@ -17,905 +17,22 @@ limitations under the License.
package scheduler
import (
"fmt"
"io"
"math"
"net/http"
"sync"
"time"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler"
execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
offermetrics "k8s.io/kubernetes/contrib/mesos/pkg/offers/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/slave"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/kubelet/container"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/pkg/tools"
"k8s.io/kubernetes/pkg/util/sets"
)
type PluginInterface interface {
// the apiserver may have a different state for the pod than we do
// so reconcile our records, but only for this one pod
reconcileTask(*podtask.T)
}
// Scheduler abstracts everything other components of the scheduler need
// to access from each other
type Scheduler interface {
Tasks() podtask.Registry
sync.Locker // synchronize changes to tasks, i.e. lock, get task, change task, store task, unlock
// execute the Scheduling plugin, should start a go routine and return immediately
Run(<-chan struct{})
}
// KubernetesScheduler implements:
// 1: A mesos scheduler.
// 2: A kubernetes scheduler plugin.
// 3: A kubernetes pod.Registry.
type KubernetesScheduler struct {
// We use a lock here to avoid races
// between invoking the mesos callback
// and invoking the pod registry interfaces.
// In particular, changes to podtask.T objects are currently guarded by this lock.
*sync.RWMutex
PodScheduler
// Config related, write-once
schedcfg *schedcfg.Config
executor *mesos.ExecutorInfo
executorGroup uint64
client *client.Client
etcdClient tools.EtcdClient
failoverTimeout float64 // in seconds
reconcileInterval int64
nodeRegistrator node.Registrator
// Mesos context.
driver bindings.SchedulerDriver // late initialization
frameworkId *mesos.FrameworkID
masterInfo *mesos.MasterInfo
registered bool
registration chan struct{} // signal chan that closes upon first successful registration
onRegistration sync.Once
offers offers.Registry
slaveHostNames *slave.Registry
// unsafe state, needs to be guarded
taskRegistry podtask.Registry
// via deferred init
plugin PluginInterface
reconciler *Reconciler
reconcileCooldown time.Duration
asRegisteredMaster proc.Doer
terminate <-chan struct{} // signal chan, closes when we should kill background tasks
}
type Config struct {
Schedcfg schedcfg.Config
Executor *mesos.ExecutorInfo
Scheduler PodScheduler
Client *client.Client
EtcdClient tools.EtcdClient
FailoverTimeout float64
ReconcileInterval int64
ReconcileCooldown time.Duration
LookupNode node.LookupFunc
}
// New creates a new KubernetesScheduler
func New(config Config) *KubernetesScheduler {
var k *KubernetesScheduler
k = &KubernetesScheduler{
schedcfg: &config.Schedcfg,
RWMutex: new(sync.RWMutex),
executor: config.Executor,
executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
PodScheduler: config.Scheduler,
client: config.Client,
etcdClient: config.EtcdClient,
failoverTimeout: config.FailoverTimeout,
reconcileInterval: config.ReconcileInterval,
nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode),
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
// the node must be registered and have up-to-date labels
n := config.LookupNode(o.GetHostname())
if n == nil || !node.IsUpToDate(n, node.SlaveAttributesToLabels(o.GetAttributes())) {
return false
}
// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours
for _, eid := range o.GetExecutorIds() {
execuid := uid.Parse(eid.GetValue())
if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
return false
}
}
return true
},
DeclineOffer: func(id string) <-chan error {
errOnce := proc.NewErrorOnce(k.terminate)
errOuter := k.asRegisteredMaster.Do(func() {
var err error
defer errOnce.Report(err)
offerId := mutil.NewOfferID(id)
filters := &mesos.Filters{}
_, err = k.driver.DeclineOffer(offerId, filters)
})
return errOnce.Send(errOuter).Err()
},
// remember expired offers so that we can tell if a previously scheduled task relies on one
LingerTTL: config.Schedcfg.OfferLingerTTL.Duration,
TTL: config.Schedcfg.OfferTTL.Duration,
ListenerDelay: config.Schedcfg.ListenerDelay.Duration,
}),
slaveHostNames: slave.NewRegistry(),
taskRegistry: podtask.NewInMemoryRegistry(),
reconcileCooldown: config.ReconcileCooldown,
registration: make(chan struct{}),
asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
return proc.ErrorChanf("cannot execute action with unregistered scheduler")
}),
}
return k
}
func (k *KubernetesScheduler) Init(electedMaster proc.Process, pl PluginInterface, mux *http.ServeMux) error {
log.V(1).Infoln("initializing kubernetes mesos scheduler")
k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
if !k.registered {
return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
}
return electedMaster.Do(a)
})
k.terminate = electedMaster.Done()
k.plugin = pl
k.offers.Init(k.terminate)
k.InstallDebugHandlers(mux)
k.nodeRegistrator.Run(k.terminate)
return k.recoverTasks()
}
func (k *KubernetesScheduler) asMaster() proc.Doer {
k.RLock()
defer k.RUnlock()
return k.asRegisteredMaster
}
func (k *KubernetesScheduler) InstallDebugHandlers(mux *http.ServeMux) {
wrappedHandler := func(uri string, h http.Handler) {
mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
ch := make(chan struct{})
closer := runtime.Closer(ch)
proc.OnError(k.asMaster().Do(func() {
defer closer()
h.ServeHTTP(w, r)
}), func(err error) {
defer closer()
log.Warningf("failed HTTP request for %s: %v", uri, err)
w.WriteHeader(http.StatusServiceUnavailable)
}, k.terminate)
select {
case <-time.After(k.schedcfg.HttpHandlerTimeout.Duration):
log.Warningf("timed out waiting for request to be processed")
w.WriteHeader(http.StatusServiceUnavailable)
return
case <-ch: // noop
}
})
}
requestReconciliation := func(uri string, requestAction func()) {
wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestAction()
w.WriteHeader(http.StatusNoContent)
}))
}
requestReconciliation("/debug/actions/requestExplicit", k.reconciler.RequestExplicit)
requestReconciliation("/debug/actions/requestImplicit", k.reconciler.RequestImplicit)
wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
slaves := k.slaveHostNames.SlaveIDs()
for _, slaveId := range slaves {
_, err := k.driver.SendFrameworkMessage(
k.executor.ExecutorId,
mutil.NewSlaveID(slaveId),
messages.Kamikaze)
if err != nil {
log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
} else {
io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
}
}
io.WriteString(w, "OK")
}))
}
func (k *KubernetesScheduler) Registration() <-chan struct{} {
return k.registration
}
// Registered is called when the scheduler registered with the master successfully.
func (k *KubernetesScheduler) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)
k.driver = drv
k.frameworkId = fid
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.reconciler.RequestExplicit()
}
func (k *KubernetesScheduler) storeFrameworkId() {
// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
_, err := k.etcdClient.Set(meta.FrameworkIDKey, k.frameworkId.GetValue(), uint64(k.failoverTimeout))
if err != nil {
log.Errorf("failed to renew frameworkId TTL: %v", err)
}
}
// Reregistered is called when the scheduler re-registered with the master successfully.
// This happens when the master fails over.
func (k *KubernetesScheduler) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
log.Infof("Scheduler reregistered with the master: %v\n", mi)
k.driver = drv
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.reconciler.RequestExplicit()
}
// perform one-time initialization actions upon the first registration event received from Mesos.
func (k *KubernetesScheduler) onInitialRegistration(driver bindings.SchedulerDriver) {
defer close(k.registration)
if k.failoverTimeout > 0 {
refreshInterval := k.schedcfg.FrameworkIdRefreshInterval.Duration
if k.failoverTimeout < k.schedcfg.FrameworkIdRefreshInterval.Duration.Seconds() {
refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
}
go runtime.Until(k.storeFrameworkId, refreshInterval, k.terminate)
}
r1 := k.makeTaskRegistryReconciler()
r2 := k.makePodRegistryReconciler()
k.reconciler = newReconciler(k.asRegisteredMaster, k.makeCompositeReconciler(r1, r2),
k.reconcileCooldown, k.schedcfg.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
go k.reconciler.Run(driver)
if k.reconcileInterval > 0 {
ri := time.Duration(k.reconcileInterval) * time.Second
time.AfterFunc(k.schedcfg.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.reconciler.RequestImplicit, ri, k.terminate) })
log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedcfg.InitialImplicitReconciliationDelay.Duration)
}
}
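The refresh-interval rule above, isolated for clarity: use the configured interval unless the failover timeout is shorter, in which case refresh at half the timeout but at least once per second. A minimal sketch, not the scheduler's actual helper:

func frameworkIdRefreshInterval(failoverTimeout float64, configured time.Duration) time.Duration {
	if failoverTimeout < configured.Seconds() {
		// half the failover timeout, floored at one second
		return time.Duration(math.Max(1, failoverTimeout/2)) * time.Second
	}
	return configured
}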
// Disconnected is called when the scheduler loses connection to the master.
func (k *KubernetesScheduler) Disconnected(driver bindings.SchedulerDriver) {
log.Infof("Master disconnected!\n")
k.registered = false
// discard all cached offers to avoid unnecessary TASK_LOST updates
k.offers.Invalidate("")
}
// ResourceOffers is called when the scheduler receives some offers from the master.
func (k *KubernetesScheduler) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
log.V(2).Infof("Received offers %+v", offers)
// Record the offers in the global offer map as well as each slave's offer map.
k.offers.Add(offers)
for _, offer := range offers {
slaveId := offer.GetSlaveId().GetValue()
k.slaveHostNames.Register(slaveId, offer.GetHostname())
// create the api object if it does not exist already
if k.nodeRegistrator != nil {
labels := node.SlaveAttributesToLabels(offer.GetAttributes())
_, err := k.nodeRegistrator.Register(offer.GetHostname(), labels)
if err != nil {
log.Error(err)
}
}
}
}
// OfferRescinded is called when the resources are rescinded from the scheduler.
func (k *KubernetesScheduler) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
log.Infof("Offer rescinded %v\n", offerId)
oid := offerId.GetValue()
k.offers.Delete(oid, offermetrics.OfferRescinded)
}
// StatusUpdate is called when a status update message is sent to the scheduler.
func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
source, reason := "none", "none"
if taskStatus.Source != nil {
source = (*taskStatus.Source).String()
}
if taskStatus.Reason != nil {
reason = (*taskStatus.Reason).String()
}
taskState := taskStatus.GetState()
metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()
message := "none"
if taskStatus.Message != nil {
message = *taskStatus.Message
}
log.Infof(
"task status update %q from %q for task %q on slave %q executor %q for reason %q with message %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason,
message,
)
switch taskState {
case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
if _, state := k.taskRegistry.UpdateStatus(taskStatus); state == podtask.StateUnknown {
if taskState != mesos.TaskState_TASK_FINISHED {
//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
//I don't want to reincarnate then.. TASK_LOST is a special case because
//the master is stateless and there are scenarios where I may get TASK_LOST
//followed by TASK_RUNNING.
//TODO(jdef) consider running this asynchronously since there are API server
//calls that may be made
k.reconcileNonTerminalTask(driver, taskStatus)
} // else, we don't really care about FINISHED tasks that aren't registered
return
}
if hostName := k.slaveHostNames.HostName(taskStatus.GetSlaveId().GetValue()); hostName == "" {
// a registered task has an update reported by a slave that we don't recognize.
// this should never happen! So we don't reconcile it.
log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
return
}
case mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
go k.plugin.reconcileTask(task)
return
}
} else {
// unknown task failed, not much we can do about it
return
}
// last-ditch effort to reconcile our records
fallthrough
case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
k.reconcileTerminalTask(driver, taskStatus)
default:
log.Errorf(
"unknown task status %q from %q for task %q on slave %q executor %q for reason %q with message %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason,
message,
)
}
}
func (k *KubernetesScheduler) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
task, state := k.taskRegistry.UpdateStatus(taskStatus)
if (state == podtask.StateRunning || state == podtask.StatePending) &&
((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared)) {
//--
// pod-task has metadata that refers to:
// (1) a task that Mesos no longer knows about, or else
// (2) a pod that the Kubelet will never report as "failed"
// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
// For now, destroy the pod and hope that there's a replication controller backing it up.
// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
pod := &task.Pod
log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
if err := k.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
}
} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
// attempt to prevent dangling pods in the pod and task registries
log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
k.reconciler.RequestExplicit()
} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
//If we're reconciling and receive this then the executor may be
//running a task that we need it to kill. It's possible that the framework
//is unrecognized by the master at this point, so KillTask is not guaranteed
//to do anything. The underlying driver transport may be able to send a
//FrameworkMessage directly to the slave to terminate the task.
log.V(2).Infof("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
log.Error(err.Error())
}
}
}
// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
// attempt to recover task from pod info:
// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
// - pull the pod metadata down from the api server
// - perform task recovery based on pod metadata
taskId := taskStatus.TaskId.GetValue()
if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
// there will be no data in the task status that we can use to determine the associated pod
switch taskStatus.GetState() {
case mesos.TaskState_TASK_STAGING:
// there is still hope for this task, don't kill it just yet
//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
return
default:
// for TASK_{STARTING,RUNNING} we should have already attempted recovery via recoverTasks().
// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
// be processing this reconciliation update before we process the one from the executor.
// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
// so it gets killed.
log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
}
} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
podStatus.Name, taskId, err)
} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
if t, ok, err := podtask.RecoverFrom(*pod); ok {
log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
_, err := k.taskRegistry.Register(t)
if err != nil {
// someone beat us to it?!
log.Warningf("failed to register recovered task: %v", err)
return
} else {
k.taskRegistry.UpdateStatus(taskStatus)
}
return
} else if err != nil {
//should kill the pod and the task
log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
}
} else {
//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
//metadata is not appropriate for task reconstruction -- which should almost certainly never
//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
//we were failed over.
//kill this task, allow the newly launched scheduler to schedule the new pod
log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
}
} else if errors.IsNotFound(err) {
// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
} else if errors.IsServerTimeout(err) {
log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
return
} else {
log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
return
}
if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
log.Errorf("failed to kill task %v: %v", taskId, err)
}
}
// FrameworkMessage is called when the scheduler receives a message from the executor.
func (k *KubernetesScheduler) FrameworkMessage(driver bindings.SchedulerDriver,
executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) {
log.Infof("Received message from executor %v of slave %v: %v\n", executorId, slaveId, message)
}
// SlaveLost is called when some slave is lost.
func (k *KubernetesScheduler) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) {
log.Infof("Slave %v is lost\n", slaveId)
sid := slaveId.GetValue()
k.offers.InvalidateForSlave(sid)
// TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile
// tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically
// flush lost slaves older than X that no tasks or pods reference.
// unfinished tasks/pods will be dropped. use a replication controller if you want pods to
// be restarted when slaves die.
}
// ExecutorLost is called when some executor is lost.
func (k *KubernetesScheduler) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) {
log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status)
// TODO(yifan): Restart any unfinished tasks of the executor.
}
// Error is called when there is an unrecoverable error in the scheduler or scheduler driver.
// The driver should have been aborted before this is invoked.
func (k *KubernetesScheduler) Error(driver bindings.SchedulerDriver, message string) {
log.Fatalf("fatal scheduler error: %v\n", message)
}
// filter func used for explicit task reconciliation, selects only non-terminal tasks which
// have been communicated to mesos (read: launched).
func explicitTaskFilter(t *podtask.T) bool {
switch t.State {
case podtask.StateRunning:
return true
case podtask.StatePending:
return t.Has(podtask.Launched)
default:
return false
}
}
// invoke the given ReconcilerAction funcs in sequence, aborting the sequence if reconciliation
// is cancelled. if any other errors occur the composite reconciler will attempt to complete the
// sequence, reporting only the last generated error.
func (k *KubernetesScheduler) makeCompositeReconciler(actions ...ReconcilerAction) ReconcilerAction {
if x := len(actions); x == 0 {
// programming error
panic("no actions specified for composite reconciler")
} else if x == 1 {
return actions[0]
}
chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b ReconcilerAction) <-chan error {
ech := a(d, c)
ch := make(chan error, 1)
go func() {
select {
case <-k.terminate:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
ech = b(d, c)
select {
case <-k.terminate:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
close(ch)
return
}
}
ch <- fmt.Errorf("aborting composite reconciler action")
}()
return ch
}
result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, actions[0], actions[1])
}
for i := 2; i < len(actions); i++ {
i := i
next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, ReconcilerAction(result), actions[i])
}
result = next
}
return ReconcilerAction(result)
}
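A synchronous simplification of the composite semantics described in the comment above (abort on cancellation, otherwise attempt the whole sequence and report only the last error); the real version is asynchronous and channel-based:

package reconcile

import "errors"

type action func(cancel <-chan struct{}) error

func composite(actions ...action) action {
	if len(actions) == 0 {
		panic("no actions specified for composite") // programming error
	}
	return func(cancel <-chan struct{}) error {
		var last error
		for _, a := range actions {
			select {
			case <-cancel:
				return errors.New("composite action cancelled")
			default:
			}
			if err := a(cancel); err != nil {
				last = err // remember only the last error
			}
		}
		return last
	}
}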
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks listed in the scheduler's internal taskRegistry.
func (k *KubernetesScheduler) makeTaskRegistryReconciler() ReconcilerAction {
return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
taskToSlave := make(map[string]string)
for _, t := range k.taskRegistry.List(explicitTaskFilter) {
if t.Spec.SlaveID != "" {
taskToSlave[t.ID] = t.Spec.SlaveID
}
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks identified by annotations in the Kubernetes pod registry.
func (k *KubernetesScheduler) makePodRegistryReconciler() ReconcilerAction {
return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
podList, err := k.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
if err != nil {
return proc.ErrorChanf("failed to reconcile pod registry: %v", err)
}
taskToSlave := make(map[string]string)
for _, pod := range podList.Items {
if len(pod.Annotations) == 0 {
continue
}
taskId, found := pod.Annotations[meta.TaskIdKey]
if !found {
continue
}
slaveId, found := pod.Annotations[meta.SlaveIdKey]
if !found {
continue
}
taskToSlave[taskId] = slaveId
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
func (k *KubernetesScheduler) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
log.Info("explicit reconcile tasks")
// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
statusList := []*mesos.TaskStatus{}
remaining := sets.StringKeySet(taskToSlave)
for taskId, slaveId := range taskToSlave {
if slaveId == "" {
delete(taskToSlave, taskId)
continue
}
statusList = append(statusList, &mesos.TaskStatus{
TaskId: mutil.NewTaskID(taskId),
SlaveId: mutil.NewSlaveID(slaveId),
State: mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
})
}
select {
case <-cancel:
return reconciliationCancelledErr
default:
if _, err := driver.ReconcileTasks(statusList); err != nil {
return err
}
}
start := time.Now()
first := true
for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
first = false
// nothing to do here other than wait for status updates..
if backoff > k.schedcfg.ExplicitReconciliationMaxBackoff.Duration {
backoff = k.schedcfg.ExplicitReconciliationMaxBackoff.Duration
}
select {
case <-cancel:
return reconciliationCancelledErr
case <-time.After(backoff):
for taskId := range remaining {
if task, _ := k.taskRegistry.Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
// keep this task in remaining list
continue
}
remaining.Delete(taskId)
}
}
}
return nil
}
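The wait loop above is a capped exponential backoff that runs at least once. Stripped of the Mesos driver it reduces to roughly the following (remaining and pruneReconciled are illustrative stand-ins for the registry checks):

// backoff doubles per iteration, clamped to maxBackoff
for backoff, first := 1*time.Second, true; first || remaining() > 0; backoff *= 2 {
	first = false
	if backoff > maxBackoff {
		backoff = maxBackoff
	}
	time.Sleep(backoff) // the real loop selects on cancel / time.After instead
	pruneReconciled()   // drop tasks whose fresh status arrived since start
}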
var (
reconciliationCancelledErr = fmt.Errorf("explicit task reconciliation cancelled")
)
type ReconcilerAction func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error
type Reconciler struct {
proc.Doer
Action ReconcilerAction
explicit chan struct{} // send an empty struct to trigger explicit reconciliation
implicit chan struct{} // send an empty struct to trigger implicit reconciliation
done <-chan struct{} // close this when you want the reconciler to exit
cooldown time.Duration
explicitReconciliationAbortTimeout time.Duration
}
func newReconciler(doer proc.Doer, action ReconcilerAction,
cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) *Reconciler {
return &Reconciler{
Doer: doer,
explicit: make(chan struct{}, 1),
implicit: make(chan struct{}, 1),
cooldown: cooldown,
explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
done: done,
Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
// trigger the reconciler action in the doer's execution context,
// but it could take a while and the scheduler needs to be able to
// process updates, the callbacks for which ALSO execute in the SAME
// deferred execution context -- so the action MUST be executed async.
errOnce := proc.NewErrorOnce(cancel)
return errOnce.Send(doer.Do(func() {
// only triggers the action if we're the currently elected,
// registered master and runs the action async.
go func() {
var err <-chan error
defer errOnce.Send(err)
err = action(driver, cancel)
}()
})).Err()
},
}
}
func (r *Reconciler) RequestExplicit() {
select {
case r.explicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
func (r *Reconciler) RequestImplicit() {
select {
case r.implicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
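Both request channels are buffered with capacity 1, so requests made while one is already pending coalesce into a single wakeup. A self-contained sketch of the pattern:

requests := make(chan struct{}, 1)
request := func() {
	select {
	case requests <- struct{}{}: // queued a wakeup
	default: // one already pending; drop this request
	}
}
request()
request() // coalesced: the consumer sees a single pending request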
// execute task reconciliation, returns when r.done is closed. intended to run as a goroutine.
// if reconciliation is requested while another is in progress, the in-progress operation will be
// cancelled before the new reconciliation operation begins.
func (r *Reconciler) Run(driver bindings.SchedulerDriver) {
var cancel, finished chan struct{}
requestLoop:
for {
select {
case <-r.done:
return
default: // proceed
}
select {
case <-r.implicit:
metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
select {
case <-r.done:
return
case <-r.explicit:
break // give preference to a pending request for explicit
default: // continue
// don't run implicit reconciliation while explicit is ongoing
if finished != nil {
select {
case <-finished: // continue w/ implicit
default:
log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
continue requestLoop
}
}
errOnce := proc.NewErrorOnce(r.done)
errCh := r.Do(func() {
var err error
defer errOnce.Report(err)
log.Infoln("implicit reconcile tasks")
metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
}
})
proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
log.Errorf("failed to run implicit reconciliation: %v", err)
}, r.done)
goto slowdown
}
case <-r.done:
return
case <-r.explicit: // continue
metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
}
if cancel != nil {
close(cancel)
cancel = nil
// play nice and wait for the prior operation to finish, complain
// if it doesn't
select {
case <-r.done:
return
case <-finished: // noop, expected
case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
log.Error("reconciler action failed to stop upon cancellation")
}
}
// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
// if cancellation takes too long or fails - we don't want to close the same chan
// more than once
cancel = make(chan struct{})
finished = make(chan struct{})
go func(fin chan struct{}) {
startedAt := time.Now()
defer func() {
metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
}()
metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
defer close(fin)
err := <-r.Action(driver, cancel)
if err == reconciliationCancelledErr {
metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
log.Infoln(err.Error())
} else if err != nil {
log.Errorf("reconciler action failed: %v", err)
}
}(finished)
slowdown:
// don't allow reconciliation to run very frequently, either explicit or implicit
select {
case <-r.done:
return
case <-time.After(r.cooldown): // noop
}
} // for
}
func (ks *KubernetesScheduler) recoverTasks() error {
podList, err := ks.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
if err != nil {
log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err)
return err
}
recoverSlave := func(t *podtask.T) {
slaveId := t.Spec.SlaveID
ks.slaveHostNames.Register(slaveId, t.Offer.Host())
}
for _, pod := range podList.Items {
if _, isMirrorPod := pod.Annotations[kubetypes.ConfigMirrorAnnotationKey]; isMirrorPod {
// mirrored pods are never reconciled because the scheduler isn't responsible for
// scheduling them; they're started by the executor/kubelet upon instantiation and
// reflected in the apiserver afterward. the scheduler has no knowledge of them.
continue
}
if t, ok, err := podtask.RecoverFrom(pod); err != nil {
log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err)
err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil)
//TODO(jdef) check for temporary or not-found errors
if err != nil {
log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err)
}
} else if ok {
ks.taskRegistry.Register(t)
recoverSlave(t)
log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name)
}
}
return nil
Offers() offers.Registry
Reconcile(t *podtask.T)
KillTask(id string) error
LaunchTask(t *podtask.T) error
Run(done <-chan struct{})
}

View File

@@ -0,0 +1,74 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"sync"
"github.com/stretchr/testify/mock"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"time"
)
// MockScheduler implements the Scheduler interface
type MockScheduler struct {
sync.RWMutex
mock.Mock
}
func (m *MockScheduler) Run(done <-chan struct{}) {
_ = m.Called()
runtime.Until(func() {
time.Sleep(time.Second)
}, time.Second, done)
return
}
func (m *MockScheduler) Offers() (f offers.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(offers.Registry)
}
return
}
func (m *MockScheduler) Tasks() (f podtask.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(podtask.Registry)
}
return
}
func (m *MockScheduler) KillTask(taskId string) error {
args := m.Called(taskId)
return args.Error(0)
}
func (m *MockScheduler) LaunchTask(task *podtask.T) error {
args := m.Called(task)
return args.Error(0)
}
func (m *MockScheduler) Reconcile(task *podtask.T) {
_ = m.Called()
return
}
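A short test sketch exercising the mock via testify's expectation API (On, Return and AssertExpectations come from github.com/stretchr/testify/mock; the test body is illustrative):

func TestKillTask(t *testing.T) {
	m := &MockScheduler{}
	m.On("KillTask", "task-1").Return(nil)
	if err := m.KillTask("task-1"); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	m.AssertExpectations(t)
}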

View File

@@ -42,7 +42,7 @@ func (m *SchedulerServer) newServiceWriter(stop <-chan struct{}) func() {
glog.Errorf("Can't create scheduler service: %v", err)
}
if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.Address), m.Port); err != nil {
if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.address), m.port); err != nil {
glog.Errorf("Can't create scheduler endpoints: %v", err)
}
@@ -76,8 +76,8 @@ func (m *SchedulerServer) createSchedulerServiceIfNeeded(serviceName string, ser
SessionAffinity: api.ServiceAffinityNone,
},
}
if m.ServiceAddress != nil {
svc.Spec.ClusterIP = m.ServiceAddress.String()
if m.serviceAddress != nil {
svc.Spec.ClusterIP = m.serviceAddress.String()
}
_, err := m.client.Services(api.NamespaceValue(ctx)).Create(svc)
if err != nil && errors.IsAlreadyExists(err) {

View File

@@ -54,7 +54,9 @@ import (
minioncfg "k8s.io/kubernetes/contrib/mesos/pkg/minion/config"
"k8s.io/kubernetes/contrib/mesos/pkg/profile"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
@@ -65,6 +67,7 @@ import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
"k8s.io/kubernetes/pkg/client/cache"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
clientauth "k8s.io/kubernetes/pkg/client/unversioned/auth"
"k8s.io/kubernetes/pkg/fields"
@@ -86,72 +89,72 @@ const (
)
type SchedulerServer struct {
Port int
Address net.IP
EnableProfiling bool
AuthPath string
APIServerList []string
EtcdServerList []string
EtcdConfigFile string
AllowPrivileged bool
ExecutorPath string
ProxyPath string
MesosMaster string
MesosUser string
MesosRole string
MesosAuthPrincipal string
MesosAuthSecretFile string
MesosCgroupPrefix string
MesosExecutorCPUs mresource.CPUShares
MesosExecutorMem mresource.MegaBytes
Checkpoint bool
FailoverTimeout float64
port int
address net.IP
enableProfiling bool
authPath string
apiServerList []string
etcdServerList []string
etcdConfigFile string
allowPrivileged bool
executorPath string
proxyPath string
mesosMaster string
mesosUser string
mesosRole string
mesosAuthPrincipal string
mesosAuthSecretFile string
mesosCgroupPrefix string
mesosExecutorCPUs mresource.CPUShares
mesosExecutorMem mresource.MegaBytes
checkpoint bool
failoverTimeout float64
ExecutorLogV int
ExecutorBindall bool
ExecutorSuicideTimeout time.Duration
LaunchGracePeriod time.Duration
executorLogV int
executorBindall bool
executorSuicideTimeout time.Duration
launchGracePeriod time.Duration
RunProxy bool
ProxyBindall bool
ProxyLogV int
runProxy bool
proxyBindall bool
proxyLogV int
MinionPathOverride string
MinionLogMaxSize resource.Quantity
MinionLogMaxBackups int
MinionLogMaxAgeInDays int
minionPathOverride string
minionLogMaxSize resource.Quantity
minionLogMaxBackups int
minionLogMaxAgeInDays int
MesosAuthProvider string
DriverPort uint
HostnameOverride string
ReconcileInterval int64
ReconcileCooldown time.Duration
DefaultContainerCPULimit mresource.CPUShares
DefaultContainerMemLimit mresource.MegaBytes
SchedulerConfigFileName string
Graceful bool
FrameworkName string
FrameworkWebURI string
HA bool
AdvertisedAddress string
ServiceAddress net.IP
HADomain string
KMPath string
ClusterDNS net.IP
ClusterDomain string
KubeletRootDirectory string
KubeletDockerEndpoint string
KubeletPodInfraContainerImage string
KubeletCadvisorPort uint
KubeletHostNetworkSources string
KubeletSyncFrequency time.Duration
KubeletNetworkPluginName string
StaticPodsConfigPath string
DockerCfgPath string
ContainPodResources bool
AccountForPodResources bool
mesosAuthProvider string
driverPort uint
hostnameOverride string
reconcileInterval int64
reconcileCooldown time.Duration
defaultContainerCPULimit mresource.CPUShares
defaultContainerMemLimit mresource.MegaBytes
schedulerConfigFileName string
graceful bool
frameworkName string
frameworkWebURI string
ha bool
advertisedAddress string
serviceAddress net.IP
haDomain string
kmPath string
clusterDNS net.IP
clusterDomain string
kubeletRootDirectory string
kubeletDockerEndpoint string
kubeletPodInfraContainerImage string
kubeletCadvisorPort uint
kubeletHostNetworkSources string
kubeletSyncFrequency time.Duration
kubeletNetworkPluginName string
staticPodsConfigPath string
dockerCfgPath string
containPodResources bool
accountForPodResources bool
nodeRelistPeriod time.Duration
SandboxOverlay string
sandboxOverlay string
executable string // path to the binary running this service
client *client.Client
@@ -170,36 +173,36 @@ type schedulerProcessInterface interface {
// NewSchedulerServer creates a new SchedulerServer with default parameters
func NewSchedulerServer() *SchedulerServer {
s := SchedulerServer{
Port: ports.SchedulerPort,
Address: net.ParseIP("127.0.0.1"),
FailoverTimeout: time.Duration((1 << 62) - 1).Seconds(),
port: ports.SchedulerPort,
address: net.ParseIP("127.0.0.1"),
failoverTimeout: time.Duration((1 << 62) - 1).Seconds(),
RunProxy: true,
ExecutorSuicideTimeout: execcfg.DefaultSuicideTimeout,
LaunchGracePeriod: execcfg.DefaultLaunchGracePeriod,
DefaultContainerCPULimit: mresource.DefaultDefaultContainerCPULimit,
DefaultContainerMemLimit: mresource.DefaultDefaultContainerMemLimit,
runProxy: true,
executorSuicideTimeout: execcfg.DefaultSuicideTimeout,
launchGracePeriod: execcfg.DefaultLaunchGracePeriod,
defaultContainerCPULimit: mresource.DefaultDefaultContainerCPULimit,
defaultContainerMemLimit: mresource.DefaultDefaultContainerMemLimit,
MinionLogMaxSize: minioncfg.DefaultLogMaxSize(),
MinionLogMaxBackups: minioncfg.DefaultLogMaxBackups,
MinionLogMaxAgeInDays: minioncfg.DefaultLogMaxAgeInDays,
minionLogMaxSize: minioncfg.DefaultLogMaxSize(),
minionLogMaxBackups: minioncfg.DefaultLogMaxBackups,
minionLogMaxAgeInDays: minioncfg.DefaultLogMaxAgeInDays,
MesosAuthProvider: sasl.ProviderName,
MesosCgroupPrefix: minioncfg.DefaultCgroupPrefix,
MesosMaster: defaultMesosMaster,
MesosUser: defaultMesosUser,
MesosExecutorCPUs: defaultExecutorCPUs,
MesosExecutorMem: defaultExecutorMem,
ReconcileInterval: defaultReconcileInterval,
ReconcileCooldown: defaultReconcileCooldown,
Checkpoint: true,
FrameworkName: defaultFrameworkName,
HA: false,
mesosAuthProvider: sasl.ProviderName,
mesosCgroupPrefix: minioncfg.DefaultCgroupPrefix,
mesosMaster: defaultMesosMaster,
mesosUser: defaultMesosUser,
mesosExecutorCPUs: defaultExecutorCPUs,
mesosExecutorMem: defaultExecutorMem,
reconcileInterval: defaultReconcileInterval,
reconcileCooldown: defaultReconcileCooldown,
checkpoint: true,
frameworkName: defaultFrameworkName,
ha: false,
mux: http.NewServeMux(),
KubeletCadvisorPort: 4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
KubeletSyncFrequency: 10 * time.Second,
ContainPodResources: true,
AccountForPodResources: true,
kubeletCadvisorPort: 4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
kubeletSyncFrequency: 10 * time.Second,
containPodResources: true,
accountForPodResources: true,
nodeRelistPeriod: defaultNodeRelistPeriod,
}
// cache this for later use. also useful in case the original binary gets deleted, e.g.
@@ -208,76 +211,76 @@ func NewSchedulerServer() *SchedulerServer {
log.Fatalf("failed to determine path to currently running executable: %v", err)
} else {
s.executable = filename
s.KMPath = filename
s.kmPath = filename
}
return &s
}
func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
fs.IntVar(&s.Port, "port", s.Port, "The port that the scheduler's http service runs on")
fs.IPVar(&s.Address, "address", s.Address, "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
fs.BoolVar(&s.EnableProfiling, "profiling", s.EnableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
fs.StringSliceVar(&s.APIServerList, "api-servers", s.APIServerList, "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
fs.StringVar(&s.AuthPath, "auth-path", s.AuthPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
fs.StringSliceVar(&s.EtcdServerList, "etcd-servers", s.EtcdServerList, "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
fs.StringVar(&s.EtcdConfigFile, "etcd-config", s.EtcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
fs.BoolVar(&s.AllowPrivileged, "allow-privileged", s.AllowPrivileged, "If true, allow privileged containers.")
fs.StringVar(&s.ClusterDomain, "cluster-domain", s.ClusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
fs.IPVar(&s.ClusterDNS, "cluster-dns", s.ClusterDNS, "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")
fs.StringVar(&s.StaticPodsConfigPath, "static-pods-config", s.StaticPodsConfigPath, "Path for specification of static pods. Path should point to dir containing the staticPods configuration files. Defaults to none.")
fs.IntVar(&s.port, "port", s.port, "The port that the scheduler's http service runs on")
fs.IPVar(&s.address, "address", s.address, "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
fs.BoolVar(&s.enableProfiling, "profiling", s.enableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
fs.StringSliceVar(&s.apiServerList, "api-servers", s.apiServerList, "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
fs.StringVar(&s.authPath, "auth-path", s.authPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
fs.StringSliceVar(&s.etcdServerList, "etcd-servers", s.etcdServerList, "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
fs.StringVar(&s.etcdConfigFile, "etcd-config", s.etcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
fs.BoolVar(&s.allowPrivileged, "allow-privileged", s.allowPrivileged, "If true, allow privileged containers.")
fs.StringVar(&s.clusterDomain, "cluster-domain", s.clusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
fs.IPVar(&s.clusterDNS, "cluster-dns", s.clusterDNS, "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")
fs.StringVar(&s.staticPodsConfigPath, "static-pods-config", s.staticPodsConfigPath, "Path for specification of static pods. Path should point to dir containing the staticPods configuration files. Defaults to none.")
fs.StringVar(&s.MesosMaster, "mesos-master", s.MesosMaster, "Location of the Mesos master. The format is a comma-delimited list of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
fs.StringVar(&s.MesosUser, "mesos-user", s.MesosUser, "Mesos user for this framework, defaults to root.")
fs.StringVar(&s.MesosRole, "mesos-role", s.MesosRole, "Mesos role for this framework, defaults to none.")
fs.StringVar(&s.MesosAuthPrincipal, "mesos-authentication-principal", s.MesosAuthPrincipal, "Mesos authentication principal.")
fs.StringVar(&s.MesosAuthSecretFile, "mesos-authentication-secret-file", s.MesosAuthSecretFile, "Mesos authentication secret file.")
fs.StringVar(&s.MesosAuthProvider, "mesos-authentication-provider", s.MesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
fs.StringVar(&s.DockerCfgPath, "dockercfg-path", s.DockerCfgPath, "Path to a dockercfg file that will be used by the docker instance of the minions.")
fs.StringVar(&s.MesosCgroupPrefix, "mesos-cgroup-prefix", s.MesosCgroupPrefix, "The cgroup prefix concatenated with MESOS_DIRECTORY must give the executor cgroup set by Mesos")
fs.Var(&s.MesosExecutorCPUs, "mesos-executor-cpus", "Initial CPU shares to allocate for each Mesos executor container.")
fs.Var(&s.MesosExecutorMem, "mesos-executor-mem", "Initial memory (MB) to allocate for each Mesos executor container.")
fs.BoolVar(&s.Checkpoint, "checkpoint", s.Checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
fs.Float64Var(&s.FailoverTimeout, "failover-timeout", s.FailoverTimeout, fmt.Sprintf("Framework failover timeout, in sec."))
fs.UintVar(&s.DriverPort, "driver-port", s.DriverPort, "Port that the Mesos scheduler driver process should listen on.")
fs.StringVar(&s.HostnameOverride, "hostname-override", s.HostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
fs.Int64Var(&s.ReconcileInterval, "reconcile-interval", s.ReconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
fs.DurationVar(&s.ReconcileCooldown, "reconcile-cooldown", s.ReconcileCooldown, "Minimum rest period between task reconciliation operations.")
fs.StringVar(&s.SchedulerConfigFileName, "scheduler-config", s.SchedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
fs.BoolVar(&s.Graceful, "graceful", s.Graceful, "Indicator of a graceful failover, intended for internal use only.")
fs.BoolVar(&s.HA, "ha", s.HA, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
fs.StringVar(&s.FrameworkName, "framework-name", s.FrameworkName, "The framework name to register with Mesos.")
fs.StringVar(&s.FrameworkWebURI, "framework-weburi", s.FrameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
fs.StringVar(&s.AdvertisedAddress, "advertised-address", s.AdvertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
fs.IPVar(&s.ServiceAddress, "service-address", s.ServiceAddress, "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
fs.Var(&s.DefaultContainerCPULimit, "default-container-cpu-limit", "Containers without a CPU resource limit are admitted this much CPU shares")
fs.Var(&s.DefaultContainerMemLimit, "default-container-mem-limit", "Containers without a memory resource limit are admitted this much amount of memory in MB")
fs.BoolVar(&s.ContainPodResources, "contain-pod-resources", s.ContainPodResources, "Reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
fs.BoolVar(&s.AccountForPodResources, "account-for-pod-resources", s.AccountForPodResources, "Allocate pod CPU and memory resources from offers (Default: true)")
fs.StringVar(&s.mesosMaster, "mesos-master", s.mesosMaster, "Location of the Mesos master. The format is a comma-delimited list of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
fs.StringVar(&s.mesosUser, "mesos-user", s.mesosUser, "Mesos user for this framework, defaults to root.")
fs.StringVar(&s.mesosRole, "mesos-role", s.mesosRole, "Mesos role for this framework, defaults to none.")
fs.StringVar(&s.mesosAuthPrincipal, "mesos-authentication-principal", s.mesosAuthPrincipal, "Mesos authentication principal.")
fs.StringVar(&s.mesosAuthSecretFile, "mesos-authentication-secret-file", s.mesosAuthSecretFile, "Mesos authentication secret file.")
fs.StringVar(&s.mesosAuthProvider, "mesos-authentication-provider", s.mesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
fs.StringVar(&s.dockerCfgPath, "dockercfg-path", s.dockerCfgPath, "Path to a dockercfg file that will be used by the docker instance of the minions.")
fs.StringVar(&s.mesosCgroupPrefix, "mesos-cgroup-prefix", s.mesosCgroupPrefix, "The cgroup prefix concatenated with MESOS_DIRECTORY must give the executor cgroup set by Mesos")
fs.Var(&s.mesosExecutorCPUs, "mesos-executor-cpus", "Initial CPU shares to allocate for each Mesos executor container.")
fs.Var(&s.mesosExecutorMem, "mesos-executor-mem", "Initial memory (MB) to allocate for each Mesos executor container.")
fs.BoolVar(&s.checkpoint, "checkpoint", s.checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
fs.Float64Var(&s.failoverTimeout, "failover-timeout", s.failoverTimeout, fmt.Sprintf("Framework failover timeout, in sec."))
fs.UintVar(&s.driverPort, "driver-port", s.driverPort, "Port that the Mesos scheduler driver process should listen on.")
fs.StringVar(&s.hostnameOverride, "hostname-override", s.hostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
fs.Int64Var(&s.reconcileInterval, "reconcile-interval", s.reconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
fs.DurationVar(&s.reconcileCooldown, "reconcile-cooldown", s.reconcileCooldown, "Minimum rest period between task reconciliation operations.")
fs.StringVar(&s.schedulerConfigFileName, "scheduler-config", s.schedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
fs.BoolVar(&s.graceful, "graceful", s.graceful, "Indicator of a graceful failover, intended for internal use only.")
fs.BoolVar(&s.ha, "ha", s.ha, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
fs.StringVar(&s.frameworkName, "framework-name", s.frameworkName, "The framework name to register with Mesos.")
fs.StringVar(&s.frameworkWebURI, "framework-weburi", s.frameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
fs.StringVar(&s.advertisedAddress, "advertised-address", s.advertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
fs.IPVar(&s.serviceAddress, "service-address", s.serviceAddress, "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
fs.Var(&s.defaultContainerCPULimit, "default-container-cpu-limit", "Containers without a CPU resource limit are admitted with this many CPU shares")
fs.Var(&s.defaultContainerMemLimit, "default-container-mem-limit", "Containers without a memory resource limit are admitted with this much memory (in MB)")
fs.BoolVar(&s.containPodResources, "contain-pod-resources", s.containPodResources, "Reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
fs.BoolVar(&s.accountForPodResources, "account-for-pod-resources", s.accountForPodResources, "Allocate pod CPU and memory resources from offers (Default: true)")
fs.DurationVar(&s.nodeRelistPeriod, "node-monitor-period", s.nodeRelistPeriod, "Period between relisting of all nodes from the apiserver.")
fs.IntVar(&s.ExecutorLogV, "executor-logv", s.ExecutorLogV, "Logging verbosity of spawned minion and executor processes.")
fs.BoolVar(&s.ExecutorBindall, "executor-bindall", s.ExecutorBindall, "When true will set -address of the executor to 0.0.0.0.")
fs.DurationVar(&s.ExecutorSuicideTimeout, "executor-suicide-timeout", s.ExecutorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")
fs.DurationVar(&s.LaunchGracePeriod, "mesos-launch-grace-period", s.LaunchGracePeriod, "Launch grace period after which launching tasks will be cancelled. Zero disables launch cancellation.")
fs.StringVar(&s.SandboxOverlay, "mesos-sandbox-overlay", s.SandboxOverlay, "Path to an archive (tar.gz, tar.bz2 or zip) extracted into the sandbox.")
fs.IntVar(&s.executorLogV, "executor-logv", s.executorLogV, "Logging verbosity of spawned minion and executor processes.")
fs.BoolVar(&s.executorBindall, "executor-bindall", s.executorBindall, "When true will set -address of the executor to 0.0.0.0.")
fs.DurationVar(&s.executorSuicideTimeout, "executor-suicide-timeout", s.executorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")
fs.DurationVar(&s.launchGracePeriod, "mesos-launch-grace-period", s.launchGracePeriod, "Launch grace period after which launching tasks will be cancelled. Zero disables launch cancellation.")
fs.StringVar(&s.sandboxOverlay, "mesos-sandbox-overlay", s.sandboxOverlay, "Path to an archive (tar.gz, tar.bz2 or zip) extracted into the sandbox.")
fs.BoolVar(&s.ProxyBindall, "proxy-bindall", s.ProxyBindall, "When true pass -proxy-bindall to the executor.")
fs.BoolVar(&s.RunProxy, "run-proxy", s.RunProxy, "Run the kube-proxy as a side process of the executor.")
fs.IntVar(&s.ProxyLogV, "proxy-logv", s.ProxyLogV, "Logging verbosity of spawned minion proxy processes.")
fs.BoolVar(&s.proxyBindall, "proxy-bindall", s.proxyBindall, "When true pass -proxy-bindall to the executor.")
fs.BoolVar(&s.runProxy, "run-proxy", s.runProxy, "Run the kube-proxy as a side process of the executor.")
fs.IntVar(&s.proxyLogV, "proxy-logv", s.proxyLogV, "Logging verbosity of spawned minion proxy processes.")
fs.StringVar(&s.MinionPathOverride, "minion-path-override", s.MinionPathOverride, "Override the PATH in the environment of the minion sub-processes.")
fs.Var(resource.NewQuantityFlagValue(&s.MinionLogMaxSize), "minion-max-log-size", "Maximum log file size for the executor and proxy before rotation")
fs.IntVar(&s.MinionLogMaxAgeInDays, "minion-max-log-age", s.MinionLogMaxAgeInDays, "Maximum log file age of the executor and proxy in days")
fs.IntVar(&s.MinionLogMaxBackups, "minion-max-log-backups", s.MinionLogMaxBackups, "Maximum log file backups of the executor and proxy to keep after rotation")
fs.StringVar(&s.minionPathOverride, "minion-path-override", s.minionPathOverride, "Override the PATH in the environment of the minion sub-processes.")
fs.Var(resource.NewQuantityFlagValue(&s.minionLogMaxSize), "minion-max-log-size", "Maximum log file size for the executor and proxy before rotation")
fs.IntVar(&s.minionLogMaxAgeInDays, "minion-max-log-age", s.minionLogMaxAgeInDays, "Maximum log file age of the executor and proxy in days")
fs.IntVar(&s.minionLogMaxBackups, "minion-max-log-backups", s.minionLogMaxBackups, "Maximum log file backups of the executor and proxy to keep after rotation")
fs.StringVar(&s.KubeletRootDirectory, "kubelet-root-dir", s.KubeletRootDirectory, "Directory path for managing kubelet files (volume mounts, etc.). Defaults to executor sandbox.")
fs.StringVar(&s.KubeletDockerEndpoint, "kubelet-docker-endpoint", s.KubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
fs.StringVar(&s.KubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.KubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
fs.UintVar(&s.KubeletCadvisorPort, "kubelet-cadvisor-port", s.KubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
fs.StringVar(&s.KubeletHostNetworkSources, "kubelet-host-network-sources", s.KubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host network. For all sources use \"*\" [default=\"file\"]")
fs.DurationVar(&s.KubeletSyncFrequency, "kubelet-sync-frequency", s.KubeletSyncFrequency, "Max period between synchronizing running containers and config")
fs.StringVar(&s.KubeletNetworkPluginName, "kubelet-network-plugin", s.KubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
fs.StringVar(&s.kubeletRootDirectory, "kubelet-root-dir", s.kubeletRootDirectory, "Directory path for managing kubelet files (volume mounts, etc.). Defaults to executor sandbox.")
fs.StringVar(&s.kubeletDockerEndpoint, "kubelet-docker-endpoint", s.kubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
fs.StringVar(&s.kubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.kubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
fs.UintVar(&s.kubeletCadvisorPort, "kubelet-cadvisor-port", s.kubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
fs.StringVar(&s.kubeletHostNetworkSources, "kubelet-host-network-sources", s.kubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host network. For all sources use \"*\" [default=\"file\"]")
fs.DurationVar(&s.kubeletSyncFrequency, "kubelet-sync-frequency", s.kubeletSyncFrequency, "Max period between synchronizing running containers and config")
fs.StringVar(&s.kubeletNetworkPluginName, "kubelet-network-plugin", s.kubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
//TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
//fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
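// Sketch, not part of this commit: a FlagSet populated by addCoreFlags is
// wired the standard pflag way; each registration passes the field twice
// because its current value doubles as the flag's default.
//
//	fs := pflag.NewFlagSet("scheduler", pflag.ExitOnError)
//	s.addCoreFlags(fs)
//	if err := fs.Parse(os.Args[1:]); err != nil {
//		log.Fatalf("failed to parse scheduler flags: %v", err)
//	}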
@@ -285,12 +288,12 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
func (s *SchedulerServer) AddStandaloneFlags(fs *pflag.FlagSet) {
s.addCoreFlags(fs)
fs.StringVar(&s.ExecutorPath, "executor-path", s.ExecutorPath, "Location of the kubernetes executor executable")
fs.StringVar(&s.executorPath, "executor-path", s.executorPath, "Location of the kubernetes executor executable")
}
func (s *SchedulerServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
s.addCoreFlags(fs)
fs.StringVar(&s.KMPath, "km-path", s.KMPath, "Location of the km executable, may be a URI or an absolute file path.")
fs.StringVar(&s.kmPath, "km-path", s.kmPath, "Location of the km executable, may be a URI or an absolute file path.")
}
// returns (downloadURI, basename(path))
@@ -310,12 +313,12 @@ func (s *SchedulerServer) serveFrameworkArtifactWithFilename(path string, filena
serveFile("/"+filename, path)
hostURI := ""
if s.AdvertisedAddress != "" {
hostURI = fmt.Sprintf("http://%s/%s", s.AdvertisedAddress, filename)
} else if s.HA && s.HADomain != "" {
hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.HADomain, ports.SchedulerPort, filename)
if s.advertisedAddress != "" {
hostURI = fmt.Sprintf("http://%s/%s", s.advertisedAddress, filename)
} else if s.ha && s.haDomain != "" {
hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.haDomain, ports.SchedulerPort, filename)
} else {
hostURI = fmt.Sprintf("http://%s:%d/%s", s.Address.String(), s.Port, filename)
hostURI = fmt.Sprintf("http://%s:%d/%s", s.address.String(), s.port, filename)
}
log.V(2).Infof("Hosting artifact '%s' at '%s'", filename, hostURI)
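// The hostURI precedence above, restated as a sketch (field names as in this
// file): an explicit --advertised-address wins, then the HA service domain,
// then the scheduler's own bind address and port.
//
//	switch {
//	case s.advertisedAddress != "":
//		base = s.advertisedAddress
//	case s.ha && s.haDomain != "":
//		base = fmt.Sprintf("%s.%s:%d", SCHEDULER_SERVICE_NAME, s.haDomain, ports.SchedulerPort)
//	default:
//		base = fmt.Sprintf("%s:%d", s.address.String(), s.port)
//	}
//	hostURI = fmt.Sprintf("http://%s/%s", base, filename)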
@@ -327,21 +330,21 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
Shell: proto.Bool(false),
}
if s.ExecutorPath != "" {
uri, executorCmd := s.serveFrameworkArtifact(s.ExecutorPath)
if s.executorPath != "" {
uri, executorCmd := s.serveFrameworkArtifact(s.executorPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
} else if !hks.FindServer(hyperkube.CommandMinion) {
return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
} else {
if strings.Index(s.KMPath, "://") > 0 {
if strings.Index(s.kmPath, "://") > 0 {
// URI could point directly to executable, e.g. hdfs:///km
// or else indirectly, e.g. http://acmestorage/tarball.tgz
// so we assume that for this case the command will always "km"
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.KMPath), Executable: proto.Bool(true)})
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.kmPath), Executable: proto.Bool(true)})
ci.Value = proto.String("./km") // TODO(jdef) extract constant
} else if s.KMPath != "" {
uri, kmCmd := s.serveFrameworkArtifact(s.KMPath)
} else if s.kmPath != "" {
uri, kmCmd := s.serveFrameworkArtifact(s.kmPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
} else {
@@ -351,55 +354,55 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
}
ci.Arguments = append(ci.Arguments, hyperkube.CommandMinion)
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.RunProxy))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.ProxyBindall))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-logv=%d", s.ProxyLogV))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.runProxy))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.proxyBindall))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-logv=%d", s.proxyLogV))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--path-override=%s", s.MinionPathOverride))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-size=%v", s.MinionLogMaxSize.String()))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-backups=%d", s.MinionLogMaxBackups))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-age=%d", s.MinionLogMaxAgeInDays))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--path-override=%s", s.minionPathOverride))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-size=%v", s.minionLogMaxSize.String()))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-backups=%d", s.minionLogMaxBackups))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-age=%d", s.minionLogMaxAgeInDays))
}
if s.SandboxOverlay != "" {
if _, err := os.Stat(s.SandboxOverlay); os.IsNotExist(err) {
log.Fatalf("Sandbox overlay archive not found: %s", s.SandboxOverlay)
if s.sandboxOverlay != "" {
if _, err := os.Stat(s.sandboxOverlay); os.IsNotExist(err) {
return nil, nil, fmt.Errorf("Sandbox overlay archive not found: %s", s.sandboxOverlay)
}
uri, _ := s.serveFrameworkArtifact(s.SandboxOverlay)
uri, _ := s.serveFrameworkArtifact(s.sandboxOverlay)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(false), Extract: proto.Bool(true)})
}
if s.DockerCfgPath != "" {
uri := s.serveFrameworkArtifactWithFilename(s.DockerCfgPath, ".dockercfg")
if s.dockerCfgPath != "" {
uri := s.serveFrameworkArtifactWithFilename(s.dockerCfgPath, ".dockercfg")
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(false), Extract: proto.Bool(false)})
}
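// Fetcher semantics for the URIs above, per the mesos.CommandInfo_URI message
// used throughout this function: Executable marks the fetched file +x, while
// Extract asks the Mesos fetcher to unpack recognized archives into the
// sandbox. The overlay is therefore unpacked in place and .dockercfg arrives
// as a plain file:
//
//	&mesos.CommandInfo_URI{
//		Value:      proto.String(uri),
//		Executable: proto.Bool(false),
//		Extract:    proto.Bool(true), // false for .dockercfg
//	}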
//TODO(jdef): provide some way (env var?) for users to customize executor config
//TODO(jdef): set -address to 127.0.0.1 if `address` is 127.0.0.1
apiServerArgs := strings.Join(s.APIServerList, ",")
apiServerArgs := strings.Join(s.apiServerList, ",")
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--api-servers=%s", apiServerArgs))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.ExecutorLogV)) // this also applies to the minion
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.AllowPrivileged))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.ExecutorSuicideTimeout))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-launch-grace-period=%v", s.LaunchGracePeriod))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.executorLogV)) // this also applies to the minion
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.allowPrivileged))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.executorSuicideTimeout))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-launch-grace-period=%v", s.launchGracePeriod))
if s.ExecutorBindall {
if s.executorBindall {
//TODO(jdef) determine whether hostname-override is really needed for bindall because
//it conflicts with kubelet node status checks/updates
//ci.Arguments = append(ci.Arguments, "--hostname-override=0.0.0.0")
ci.Arguments = append(ci.Arguments, "--address=0.0.0.0")
}
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-cgroup-prefix=%v", s.MesosCgroupPrefix))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.KubeletCadvisorPort))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.KubeletSyncFrequency))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.ContainPodResources))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.EnableProfiling))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-cgroup-prefix=%v", s.mesosCgroupPrefix))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.kubeletCadvisorPort))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.kubeletSyncFrequency))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.containPodResources))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.enableProfiling))
if s.AuthPath != "" {
if s.authPath != "" {
//TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file
uri, basename := s.serveFrameworkArtifact(s.AuthPath)
uri, basename := s.serveFrameworkArtifact(s.authPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri)})
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--auth-path=%s", basename))
}
@@ -408,15 +411,15 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value))
}
}
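// The appendOptional closure whose tail appears above presumably guards on a
// non-empty value; a minimal reconstruction consistent with its call sites:
//
//	appendOptional := func(name, value string) {
//		if value != "" {
//			ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value))
//		}
//	}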
if s.ClusterDNS != nil {
appendOptional("cluster-dns", s.ClusterDNS.String())
if s.clusterDNS != nil {
appendOptional("cluster-dns", s.clusterDNS.String())
}
appendOptional("cluster-domain", s.ClusterDomain)
appendOptional("root-dir", s.KubeletRootDirectory)
appendOptional("docker-endpoint", s.KubeletDockerEndpoint)
appendOptional("pod-infra-container-image", s.KubeletPodInfraContainerImage)
appendOptional("host-network-sources", s.KubeletHostNetworkSources)
appendOptional("network-plugin", s.KubeletNetworkPluginName)
appendOptional("cluster-domain", s.clusterDomain)
appendOptional("root-dir", s.kubeletRootDirectory)
appendOptional("docker-endpoint", s.kubeletDockerEndpoint)
appendOptional("pod-infra-container-image", s.kubeletPodInfraContainerImage)
appendOptional("host-network-sources", s.kubeletHostNetworkSources)
appendOptional("network-plugin", s.kubeletNetworkPluginName)
log.V(1).Infof("prepared executor command %q with args '%+v'", ci.GetValue(), ci.Arguments)
@@ -429,8 +432,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
// Check for staticPods
var staticPodCPUs, staticPodMem float64
if s.StaticPodsConfigPath != "" {
bs, paths, err := archive.ZipDir(s.StaticPodsConfigPath)
if s.staticPodsConfigPath != "" {
bs, paths, err := archive.ZipDir(s.staticPodsConfigPath)
if err != nil {
return nil, nil, err
}
@@ -451,8 +454,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
}
// TODO(sttts): allow unlimited static pods as well and patch in the default resource limits
unlimitedCPU := mresource.LimitPodCPU(&pod, s.DefaultContainerCPULimit)
unlimitedMem := mresource.LimitPodMem(&pod, s.DefaultContainerMemLimit)
unlimitedCPU := mresource.LimitPodCPU(&pod, s.defaultContainerCPULimit)
unlimitedMem := mresource.LimitPodMem(&pod, s.defaultContainerMemLimit)
if unlimitedCPU {
return nil, nil, fmt.Errorf("found static pod without limit on cpu resources: %v", podPath)
}
@@ -473,8 +476,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
}
execInfo.Resources = []*mesos.Resource{
mutil.NewScalarResource("cpus", float64(s.MesosExecutorCPUs)+staticPodCPUs),
mutil.NewScalarResource("mem", float64(s.MesosExecutorMem)+staticPodMem),
mutil.NewScalarResource("cpus", float64(s.mesosExecutorCPUs)+staticPodCPUs),
mutil.NewScalarResource("mem", float64(s.mesosExecutorMem)+staticPodMem),
}
// calculate ExecutorInfo hash to be used for validating compatibility
@@ -489,7 +492,7 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
// TODO(jdef): hacked from kubelet/server/server.go
// TODO(k8s): replace this with clientcmd
func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) {
authInfo, err := clientauth.LoadFromFile(s.AuthPath)
authInfo, err := clientauth.LoadFromFile(s.authPath)
if err != nil {
log.Warningf("Could not load kubernetes auth path: %v. Continuing with defaults.", err)
}
@@ -501,14 +504,14 @@ func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) {
if err != nil {
return nil, err
}
if len(s.APIServerList) < 1 {
if len(s.apiServerList) < 1 {
return nil, fmt.Errorf("no api servers specified")
}
// TODO: adapt Kube client to support LB over several servers
if len(s.APIServerList) > 1 {
if len(s.apiServerList) > 1 {
log.Infof("Multiple api servers specified. Picking first one")
}
clientConfig.Host = s.APIServerList[0]
clientConfig.Host = s.apiServerList[0]
c, err := client.New(&clientConfig)
if err != nil {
return nil, err
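// Until the load-balancing TODO above lands, only the first --api-servers
// entry is ever contacted; a usage sketch with hypothetical addresses:
//
//	s.apiServerList = []string{"http://10.0.0.1:8080", "http://10.0.0.2:8080"}
//	c, err := s.createAPIServerClient() // talks to 10.0.0.1 only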
@@ -531,8 +534,8 @@ func (s *SchedulerServer) getDriver() (driver bindings.SchedulerDriver) {
func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
// get scheduler low-level config
sc := schedcfg.CreateDefaultConfig()
if s.SchedulerConfigFileName != "" {
f, err := os.Open(s.SchedulerConfigFileName)
if s.schedulerConfigFileName != "" {
f, err := os.Open(s.schedulerConfigFileName)
if err != nil {
log.Fatalf("Cannot open scheduler config file: %v", err)
}
@@ -545,18 +548,18 @@ func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
schedulerProcess, driverFactory, etcdClient, eid := s.bootstrap(hks, sc)
if s.EnableProfiling {
if s.enableProfiling {
profile.InstallHandler(s.mux)
}
go runtime.Until(func() {
log.V(1).Info("Starting HTTP interface")
log.Error(http.ListenAndServe(net.JoinHostPort(s.Address.String(), strconv.Itoa(s.Port)), s.mux))
log.Error(http.ListenAndServe(net.JoinHostPort(s.address.String(), strconv.Itoa(s.port)), s.mux))
}, sc.HttpBindInterval.Duration, schedulerProcess.Terminal())
if s.HA {
if s.ha {
validation := ha.ValidationFunc(validateLeadershipTransition)
srv := ha.NewCandidate(schedulerProcess, driverFactory, validation)
path := fmt.Sprintf(meta.DefaultElectionFormat, s.FrameworkName)
path := fmt.Sprintf(meta.DefaultElectionFormat, s.frameworkName)
sid := uid.New(eid.Group(), "").String()
log.Infof("registering for election at %v with id %v", path, sid)
go election.Notify(election.NewEtcdMasterElector(etcdClient), path, sid, srv, nil)
@@ -595,7 +598,7 @@ func (s *SchedulerServer) awaitFailover(schedulerProcess schedulerProcessInterfa
case <-schedulerProcess.Failover():
err = doFailover()
default:
if s.HA {
if s.ha {
err = fmt.Errorf("ha scheduler exiting instead of failing over")
} else {
log.Infof("exiting scheduler")
@@ -637,22 +640,22 @@ func newEtcd(etcdConfigFile string, etcdServerList []string) (client tools.EtcdC
func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config) (*ha.SchedulerProcess, ha.DriverFactory, tools.EtcdClient, *uid.UID) {
s.FrameworkName = strings.TrimSpace(s.FrameworkName)
if s.FrameworkName == "" {
s.frameworkName = strings.TrimSpace(s.frameworkName)
if s.frameworkName == "" {
log.Fatalf("framework-name must be a non-empty string")
}
s.FrameworkWebURI = strings.TrimSpace(s.FrameworkWebURI)
s.frameworkWebURI = strings.TrimSpace(s.frameworkWebURI)
metrics.Register()
runtime.Register()
s.mux.Handle("/metrics", prometheus.Handler())
healthz.InstallHandler(s.mux)
if (s.EtcdConfigFile != "" && len(s.EtcdServerList) != 0) || (s.EtcdConfigFile == "" && len(s.EtcdServerList) == 0) {
if (s.etcdConfigFile != "" && len(s.etcdServerList) != 0) || (s.etcdConfigFile == "" && len(s.etcdServerList) == 0) {
log.Fatalf("specify either --etcd-servers or --etcd-config")
}
if len(s.APIServerList) < 1 {
if len(s.apiServerList) < 1 {
log.Fatal("No api servers specified.")
}
@@ -662,9 +665,9 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
}
s.client = client
if s.ReconcileCooldown < defaultReconcileCooldown {
s.ReconcileCooldown = defaultReconcileCooldown
log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.ReconcileCooldown)
if s.reconcileCooldown < defaultReconcileCooldown {
s.reconcileCooldown = defaultReconcileCooldown
log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.reconcileCooldown)
}
executor, eid, err := s.prepareExecutorInfo(hks)
@@ -676,25 +679,25 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
// (1) the generic config store is available for the FrameworkId storage
// (2) the generic master election is provided by the apiserver
// Compare docs/proposals/high-availability.md
etcdClient, err := newEtcd(s.EtcdConfigFile, s.EtcdServerList)
etcdClient, err := newEtcd(s.etcdConfigFile, s.etcdServerList)
if err != nil {
log.Fatalf("misconfigured etcd: %v", err)
}
as := scheduler.NewAllocationStrategy(
as := podschedulers.NewAllocationStrategy(
podtask.NewDefaultPredicate(
s.DefaultContainerCPULimit,
s.DefaultContainerMemLimit,
s.defaultContainerCPULimit,
s.defaultContainerMemLimit,
),
podtask.NewDefaultProcurement(
s.DefaultContainerCPULimit,
s.DefaultContainerMemLimit,
s.defaultContainerCPULimit,
s.defaultContainerMemLimit,
),
)
// downgrade allocation strategy if user disables "account-for-pod-resources"
if !s.AccountForPodResources {
as = scheduler.NewAllocationStrategy(
if !s.accountForPodResources {
as = podschedulers.NewAllocationStrategy(
podtask.DefaultMinimalPredicate,
podtask.DefaultMinimalProcurement)
}
@@ -716,48 +719,61 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
return n.(*api.Node)
}
fcfs := scheduler.NewFCFSPodScheduler(as, lookupNode)
mesosPodScheduler := scheduler.New(scheduler.Config{
Schedcfg: *sc,
fcfs := podschedulers.NewFCFSPodScheduler(as, lookupNode)
framework := framework.New(framework.Config{
SchedulerConfig: *sc,
Executor: executor,
Scheduler: fcfs,
Client: client,
EtcdClient: etcdClient,
FailoverTimeout: s.FailoverTimeout,
ReconcileInterval: s.ReconcileInterval,
ReconcileCooldown: s.ReconcileCooldown,
FailoverTimeout: s.failoverTimeout,
ReconcileInterval: s.reconcileInterval,
ReconcileCooldown: s.reconcileCooldown,
LookupNode: lookupNode,
StoreFrameworkId: func(id string) {
// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
_, err := etcdClient.Set(meta.FrameworkIDKey, id, uint64(s.failoverTimeout))
if err != nil {
log.Errorf("failed to renew frameworkId TTL: %v", err)
}
},
})
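// The StoreFrameworkId hook above writes the framework ID with a TTL equal to
// the failover timeout, so the registration of a scheduler that never returns
// eventually expires from etcd. A speculative sketch of periodic renewal,
// reusing the runtime.Until helper seen earlier in Run (storeFrameworkId and
// frameworkId are hypothetical stand-ins):
//
//	go runtime.Until(func() {
//		storeFrameworkId(frameworkId) // each Set refreshes the etcd TTL
//	}, time.Duration(s.failoverTimeout/2)*time.Second, schedulerProcess.Terminal())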
masterUri := s.MesosMaster
masterUri := s.mesosMaster
info, cred, err := s.buildFrameworkInfo()
if err != nil {
log.Fatalf("Misconfigured mesos framework: %v", err)
}
schedulerProcess := ha.New(mesosPodScheduler)
schedulerProcess := ha.New(framework)
dconfig := &bindings.DriverConfig{
Scheduler: schedulerProcess,
Framework: info,
Master: masterUri,
Credential: cred,
BindingAddress: s.Address,
BindingPort: uint16(s.DriverPort),
HostnameOverride: s.HostnameOverride,
BindingAddress: s.address,
BindingPort: uint16(s.driverPort),
HostnameOverride: s.hostnameOverride,
WithAuthContext: func(ctx context.Context) context.Context {
ctx = auth.WithLoginProvider(ctx, s.MesosAuthProvider)
ctx = sasl.WithBindingAddress(ctx, s.Address)
ctx = auth.WithLoginProvider(ctx, s.mesosAuthProvider)
ctx = sasl.WithBindingAddress(ctx, s.address)
return ctx
},
}
kpl := scheduler.NewPlugin(mesosPodScheduler.NewDefaultPluginConfig(schedulerProcess.Terminal(), s.mux))
runtime.On(mesosPodScheduler.Registration(), func() { kpl.Run(schedulerProcess.Terminal()) })
runtime.On(mesosPodScheduler.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))
// create event recorder sending events to the "" namespace of the apiserver
broadcaster := record.NewBroadcaster()
recorder := broadcaster.NewRecorder(api.EventSource{Component: "scheduler"})
broadcaster.StartRecordingToSink(client.Events(""))
// create scheduler core with all components arranged around it
lw := cache.NewListWatchFromClient(client, "pods", api.NamespaceAll, fields.Everything())
sched := components.New(sc, framework, fcfs, client, recorder, schedulerProcess.Terminal(), s.mux, lw)
runtime.On(framework.Registration(), func() { sched.Run(schedulerProcess.Terminal()) })
runtime.On(framework.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))
driverFactory := ha.DriverFactory(func() (drv bindings.SchedulerDriver, err error) {
log.V(1).Infoln("performing deferred initialization")
if err = mesosPodScheduler.Init(schedulerProcess.Master(), kpl, s.mux); err != nil {
if err = framework.Init(sched, schedulerProcess.Master(), s.mux); err != nil {
return nil, fmt.Errorf("failed to initialize pod scheduler: %v", err)
}
log.V(1).Infoln("deferred init complete")
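// ha.DriverFactory defers construction of the Mesos driver until one is
// actually needed (on winning the election in HA mode), so only the active
// scheduler performs the Init above and binds to the master. Invocation
// sketch:
//
//	drv, err := driverFactory()
//	if err != nil {
//		log.Fatalf("failed to create scheduler driver: %v", err)
//	}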
@@ -806,14 +822,14 @@ func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkub
args = append(args, fmt.Sprintf("--%s=%s", flag.Name, flag.Value.String()))
}
})
if !s.Graceful {
if !s.graceful {
args = append(args, "--graceful")
}
if len(s.APIServerList) > 0 {
args = append(args, "--api-servers="+strings.Join(s.APIServerList, ","))
if len(s.apiServerList) > 0 {
args = append(args, "--api-servers="+strings.Join(s.apiServerList, ","))
}
if len(s.EtcdServerList) > 0 {
args = append(args, "--etcd-servers="+strings.Join(s.EtcdServerList, ","))
if len(s.etcdServerList) > 0 {
args = append(args, "--etcd-servers="+strings.Join(s.etcdServerList, ","))
}
args = append(args, flags.Args()...)
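// Note on the argument rebuild above: pflag's Visit walks only flags that
// were explicitly set, so the visitor copies the user's configuration
// verbatim, and --graceful is then appended at most once more, telling the
// child process that this launch is a failover. Equivalent sketch:
//
//	flags.Visit(func(f *pflag.Flag) { // set flags only, defaults are skipped
//		args = append(args, fmt.Sprintf("--%s=%s", f.Name, f.Value.String()))
//	})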
@@ -846,30 +862,30 @@ func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred
}
log.V(2).Infof("Framework configured with mesos user %v", username)
info = &mesos.FrameworkInfo{
Name: proto.String(s.FrameworkName),
Name: proto.String(s.frameworkName),
User: proto.String(username),
Checkpoint: proto.Bool(s.Checkpoint),
Checkpoint: proto.Bool(s.checkpoint),
}
if s.FrameworkWebURI != "" {
info.WebuiUrl = proto.String(s.FrameworkWebURI)
if s.frameworkWebURI != "" {
info.WebuiUrl = proto.String(s.frameworkWebURI)
}
if s.FailoverTimeout > 0 {
info.FailoverTimeout = proto.Float64(s.FailoverTimeout)
if s.failoverTimeout > 0 {
info.FailoverTimeout = proto.Float64(s.failoverTimeout)
}
if s.MesosRole != "" {
info.Role = proto.String(s.MesosRole)
if s.mesosRole != "" {
info.Role = proto.String(s.mesosRole)
}
if s.MesosAuthPrincipal != "" {
info.Principal = proto.String(s.MesosAuthPrincipal)
if s.MesosAuthSecretFile == "" {
if s.mesosAuthPrincipal != "" {
info.Principal = proto.String(s.mesosAuthPrincipal)
if s.mesosAuthSecretFile == "" {
return nil, nil, errors.New("authentication principal specified without the required credentials file")
}
secret, err := ioutil.ReadFile(s.MesosAuthSecretFile)
secret, err := ioutil.ReadFile(s.mesosAuthSecretFile)
if err != nil {
return nil, nil, err
}
cred = &mesos.Credential{
Principal: proto.String(s.MesosAuthPrincipal),
Principal: proto.String(s.mesosAuthPrincipal),
Secret: secret,
}
}
@@ -877,7 +893,7 @@ func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred
}
func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdClient) (*mesos.FrameworkID, error) {
if s.FailoverTimeout > 0 {
if s.failoverTimeout > 0 {
if response, err := client.Get(meta.FrameworkIDKey, false, false); err != nil {
if !etcdstorage.IsEtcdNotFound(err) {
return nil, fmt.Errorf("unexpected failure attempting to load framework ID from etcd: %v", err)
@@ -900,7 +916,7 @@ func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdClient) (*mesos.Fram
}
func (s *SchedulerServer) getUsername() (username string, err error) {
username = s.MesosUser
username = s.mesosUser
if username == "" {
if u, err := user.Current(); err == nil {
username = u.Username
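// Note on the fallback above: the err from user.Current is scoped to the if
// statement, so a failed lookup is deliberately swallowed and the username
// stays empty (later defaulted to root, per the --mesos-user help text). The
// same pattern in isolation:
//
//	username := s.mesosUser
//	if username == "" {
//		if u, err := user.Current(); err == nil { // lookup failure is ignored by design
//			username = u.Username
//		}
//	}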
View File

@@ -121,8 +121,8 @@ func Test_DefaultResourceLimits(t *testing.T) {
assert := assert.New(t)
s := NewSchedulerServer()
assert.Equal(s.DefaultContainerCPULimit, mresource.DefaultDefaultContainerCPULimit)
assert.Equal(s.DefaultContainerMemLimit, mresource.DefaultDefaultContainerMemLimit)
assert.Equal(s.defaultContainerCPULimit, mresource.DefaultDefaultContainerCPULimit)
assert.Equal(s.defaultContainerMemLimit, mresource.DefaultDefaultContainerMemLimit)
}
func Test_StaticPods(t *testing.T) {