Kubernetes Mesos integration
This commit includes the fundamental components of the Kubernetes-Mesos integration:
* Kubernetes-Mesos scheduler
* Kubernetes-Mesos executor
* Supporting libs

Dependencies and upstream changes are included in a separate commit for easy review. After this initial upstream, two PRs will follow:
* km (hypercube) and k8sm-controller-manager #9265
* Static pods support #9077

Fixes applied:
- Precise metrics subsystem definitions
  - mesosphere/kubernetes-mesos#331
  - https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion_r31875232
  - https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion_r31875240
- Improve comments and add clarifications
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875208
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875226
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875227
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875228
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875239
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875243
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875234
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875256
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875255
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875251
- Clarify which Schedule function is actually called
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875246
18
contrib/mesos/cmd/k8sm-executor/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package main implements the executable Kubernetes-Mesos executor.
package main
47
contrib/mesos/cmd/k8sm-executor/main.go
Normal file
@@ -0,0 +1,47 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
    "fmt"
    "os"
    "runtime"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/service"
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/version/verflag"
    "github.com/spf13/pflag"
)

func main() {
    runtime.GOMAXPROCS(runtime.NumCPU())

    s := service.NewKubeletExecutorServer()
    s.AddStandaloneFlags(pflag.CommandLine)

    util.InitFlags()
    util.InitLogs()
    defer util.FlushLogs()

    verflag.PrintAndExitIfRequested()

    if err := s.Run(hyperkube.Nil(), pflag.CommandLine.Args()); err != nil {
        fmt.Fprintln(os.Stderr, err) // avoid treating the error text as a printf format string
        os.Exit(1)
    }
}
21
contrib/mesos/cmd/k8sm-redirfd/doc.go
Normal file
@@ -0,0 +1,21 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package main is used for testing the redirfd package.
// Inspired by http://skarnet.org/software/execline/redirfd.html.
// Usage:
//     k8sm-redirfd [-n] [-b] {mode} {fd} {file} {prog...}
package main
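A minimal usage sketch of driving the redirfd package programmatically, equivalent to invoking "k8sm-redirfd w 1 /tmp/out.log prog". It assumes only the API exercised by redirfd.go below (RedirectMode.Redirect and the Stdout descriptor constant); the log path is illustrative.

package main

import (
    "fmt"
    "os"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
)

func main() {
    // Open /tmp/out.log (illustrative path) for writing and obtain the file
    // intended to serve as a child process's stdout, as redirfd.go does below.
    f, err := redirfd.Write.Redirect(false, false, redirfd.Stdout, "/tmp/out.log")
    if err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
    defer f.Close()
    // f would then be wired into an exec.Cmd, as shown in redirfd.go below.
}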
105
contrib/mesos/cmd/k8sm-redirfd/redirfd.go
Normal file
@@ -0,0 +1,105 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
    "flag"
    "fmt"
    "os"
    "os/exec"
    "syscall"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
)

func main() {
    nonblock := flag.Bool("n", false, "open file in non-blocking mode")
    changemode := flag.Bool("b", false, "change mode of file after opening it: to non-blocking mode if the -n option was not given, to blocking mode if it was")
    flag.Parse()

    args := flag.Args()
    if len(args) < 4 {
        fmt.Fprintf(os.Stderr, "expected {mode} {fd} {file} {prog...} instead of: %v\n", args)
        os.Exit(1)
    }

    var mode redirfd.RedirectMode
    switch m := args[0]; m {
    case "r":
        mode = redirfd.Read
    case "w":
        mode = redirfd.Write
    case "u":
        mode = redirfd.Update
    case "a":
        mode = redirfd.Append
    case "c":
        mode = redirfd.AppendExisting
    case "x":
        mode = redirfd.WriteNew
    default:
        fmt.Fprintf(os.Stderr, "unrecognized mode %q\n", m)
        os.Exit(1)
    }

    fd, err := redirfd.ParseFileDescriptor(args[1])
    if err != nil {
        fmt.Fprintf(os.Stderr, "failed to parse file descriptor: %v\n", err)
        os.Exit(1)
    }
    file := args[2]

    f, err := mode.Redirect(*nonblock, *changemode, fd, file)
    if err != nil {
        fmt.Fprintf(os.Stderr, "redirect failed: %q, %v\n", args[1], err)
        os.Exit(1)
    }
    var pargs []string
    if len(args) > 4 {
        pargs = args[4:]
    }
    cmd := exec.Command(args[3], pargs...)
    cmd.Stdin = os.Stdin
    cmd.Stdout = os.Stdout
    cmd.Stderr = os.Stderr
    switch fd {
    case redirfd.Stdin:
        cmd.Stdin = f
    case redirfd.Stdout:
        cmd.Stdout = f
    case redirfd.Stderr:
        cmd.Stderr = f
    default:
        cmd.ExtraFiles = []*os.File{f}
    }
    defer f.Close()
    if err = cmd.Run(); err != nil {
        // only *exec.ExitError carries a ProcessState; other errors (e.g. the
        // program could not be started) fall through to the generic exit code
        if exiterr, ok := err.(*exec.ExitError); ok {
            state := exiterr.ProcessState
            if state != nil {
                sys := state.Sys()
                if waitStatus, ok := sys.(syscall.WaitStatus); ok {
                    if waitStatus.Signaled() {
                        os.Exit(256 + int(waitStatus.Signal()))
                    } else {
                        os.Exit(waitStatus.ExitStatus())
                    }
                }
            }
        }
        os.Exit(3)
    }
}
18
contrib/mesos/cmd/k8sm-scheduler/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package main implements the executable Kubernetes-Mesos scheduler.
package main
46
contrib/mesos/cmd/k8sm-scheduler/main.go
Normal file
@@ -0,0 +1,46 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
    "fmt"
    "os"
    "runtime"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/service"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/version/verflag"
    "github.com/spf13/pflag"
)

func main() {
    runtime.GOMAXPROCS(runtime.NumCPU())
    s := service.NewSchedulerServer()
    s.AddStandaloneFlags(pflag.CommandLine)

    util.InitFlags()
    util.InitLogs()
    defer util.FlushLogs()

    verflag.PrintAndExitIfRequested()

    if err := s.Run(hyperkube.Nil(), pflag.CommandLine.Args()); err != nil {
        fmt.Fprintln(os.Stderr, err) // avoid treating the error text as a printf format string
        os.Exit(1)
    }
}
43
contrib/mesos/pkg/assert/assert.go
Normal file
@@ -0,0 +1,43 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package assert

import (
    "fmt"
    "testing"
    "time"

    "github.com/stretchr/testify/assert"
)

// EventuallyTrue asserts that the given predicate becomes true within the given timeout. It
// checks the predicate regularly, every 100ms.
func EventuallyTrue(t *testing.T, timeout time.Duration, fn func() bool, msgAndArgs ...interface{}) bool {
    start := time.Now()
    for {
        if fn() {
            return true
        }
        if time.Since(start) > timeout {
            if len(msgAndArgs) > 0 {
                return assert.Fail(t, msgAndArgs[0].(string), msgAndArgs[1:]...)
            }
            // pre-format the message: assert.Fail does not interpolate the failure message itself
            return assert.Fail(t, fmt.Sprintf("predicate fn has not been true after %v", timeout))
        }
        time.Sleep(100 * time.Millisecond)
    }
}
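A minimal sketch of how EventuallyTrue reads in a test; the polled condition (a hypothetical completion flag guarded by sync/atomic) is illustrative only.

package assert_test

import (
    "sync/atomic"
    "testing"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/assert"
)

func TestWorkerFinishes(t *testing.T) {
    var done int32
    go func() {
        time.Sleep(50 * time.Millisecond) // stand-in for real async work
        atomic.StoreInt32(&done, 1)
    }()
    // polls every 100ms until the flag flips, failing the test after 1s
    assert.EventuallyTrue(t, time.Second, func() bool {
        return atomic.LoadInt32(&done) == 1
    }, "worker never signalled completion")
}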
19
contrib/mesos/pkg/assert/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package assert is a utility package containing reusable testing functionality
// extending github.com/stretchr/testify/assert
package assert
96
contrib/mesos/pkg/backoff/backoff.go
Normal file
@@ -0,0 +1,96 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package backoff

import (
    "math/rand"
    "sync"
    "time"

    log "github.com/golang/glog"
)

type clock interface {
    Now() time.Time
}

type realClock struct{}

func (realClock) Now() time.Time {
    return time.Now()
}

type backoffEntry struct {
    backoff    time.Duration
    lastUpdate time.Time
}

type Backoff struct {
    perItemBackoff  map[string]*backoffEntry
    lock            sync.Mutex
    clock           clock
    defaultDuration time.Duration
    maxDuration     time.Duration
}

func New(initial, max time.Duration) *Backoff {
    return &Backoff{
        perItemBackoff:  map[string]*backoffEntry{},
        clock:           realClock{},
        defaultDuration: initial,
        maxDuration:     max,
    }
}

func (p *Backoff) getEntry(id string) *backoffEntry {
    p.lock.Lock()
    defer p.lock.Unlock()
    entry, ok := p.perItemBackoff[id]
    if !ok {
        entry = &backoffEntry{backoff: p.defaultDuration}
        p.perItemBackoff[id] = entry
    }
    entry.lastUpdate = p.clock.Now()
    return entry
}

func (p *Backoff) Get(id string) time.Duration {
    entry := p.getEntry(id)
    duration := entry.backoff
    entry.backoff *= 2
    if entry.backoff > p.maxDuration {
        entry.backoff = p.maxDuration
    }
    //TODO(jdef) parameterize use of jitter?
    // add jitter, get better backoff distribution
    duration = time.Duration(rand.Int63n(int64(duration)))
    log.V(3).Infof("Backing off %v for pod %s", duration, id)
    return duration
}

// GC garbage-collects records that have aged past maxDuration. Backoff users are expected
// to invoke this periodically.
func (p *Backoff) GC() {
    p.lock.Lock()
    defer p.lock.Unlock()
    now := p.clock.Now()
    for id, entry := range p.perItemBackoff {
        if now.Sub(entry.lastUpdate) > p.maxDuration {
            delete(p.perItemBackoff, id)
        }
    }
}
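A minimal sketch of the Backoff API from a caller's perspective, using only the New/Get/GC surface shown above; the pod key and durations are illustrative.

package main

import (
    "fmt"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/backoff"
)

func main() {
    b := backoff.New(time.Second, 32*time.Second) // initial 1s, capped at 32s

    for i := 0; i < 3; i++ {
        // Each Get doubles the stored backoff (up to the cap) and returns
        // a jittered duration drawn from [0, the entry's previous backoff).
        d := b.Get("default/mypod")
        fmt.Printf("attempt %d: sleeping %v\n", i, d)
        time.Sleep(d)
    }

    b.GC() // drop entries that have been idle longer than maxDuration
}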
19
contrib/mesos/pkg/backoff/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package backoff provides backoff functionality with a simple API.
// Originally copied from Kubernetes: plugin/pkg/scheduler/factory/factory.go
package backoff
18
contrib/mesos/pkg/election/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package election provides interfaces used for master election.
package election
185
contrib/mesos/pkg/election/etcd_master.go
Normal file
@@ -0,0 +1,185 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package election

import (
    "fmt"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
    "github.com/coreos/go-etcd/etcd"
    "github.com/golang/glog"
)

// Master is used to announce the current elected master.
type Master string

// IsAnAPIObject is used solely so we can work with the watch package.
// TODO(k8s): Either fix watch so this isn't necessary, or make this a real API Object.
// TODO(k8s): when it becomes clear how this package will be used, move these declarations
// to the proper place.
func (Master) IsAnAPIObject() {}

// NewEtcdMasterElector returns an implementation of election.MasterElector backed by etcd.
func NewEtcdMasterElector(h tools.EtcdGetSet) MasterElector {
    return &etcdMasterElector{etcd: h}
}

type empty struct{}

// internal implementation struct
type etcdMasterElector struct {
    etcd   tools.EtcdGetSet
    done   chan empty
    events chan watch.Event
}

// Elect implements the election.MasterElector interface.
func (e *etcdMasterElector) Elect(path, id string) watch.Interface {
    e.done = make(chan empty)
    e.events = make(chan watch.Event)
    go util.Forever(func() { e.run(path, id) }, time.Second*5)
    return e
}

func (e *etcdMasterElector) run(path, id string) {
    masters := make(chan string)
    errors := make(chan error)
    go e.master(path, id, 30, masters, errors, e.done) // TODO(jdef) extract constant
    for {
        select {
        case m := <-masters:
            e.events <- watch.Event{
                Type:   watch.Modified,
                Object: Master(m),
            }
        case err := <-errors:
            glog.Errorf("error in election: %v", err)
        }
    }
}

// ResultChan implements the watch.Interface interface.
func (e *etcdMasterElector) ResultChan() <-chan watch.Event {
    return e.events
}

// extendMaster attempts to extend ownership of a master lock for TTL seconds.
// returns "", nil if extension failed
// returns id, nil if extension succeeded
// returns "", err if an error occurred
func (e *etcdMasterElector) extendMaster(path, id string, ttl uint64, res *etcd.Response) (string, error) {
    // If it matches the passed in id, extend the lease by writing a new entry.
    // Uses compare and swap, so that if we TTL out in the meantime, the write will fail.
    // We don't handle the TTL delete w/o a write case here, it's handled in the next loop
    // iteration.
    _, err := e.etcd.CompareAndSwap(path, id, ttl, "", res.Node.ModifiedIndex)
    if err != nil && !tools.IsEtcdTestFailed(err) {
        return "", err
    }
    if err != nil && tools.IsEtcdTestFailed(err) {
        return "", nil
    }
    return id, nil
}

// becomeMaster attempts to become the master for this lock.
// returns "", nil if the attempt failed
// returns id, nil if the attempt succeeded
// returns "", err if an error occurred
func (e *etcdMasterElector) becomeMaster(path, id string, ttl uint64) (string, error) {
    _, err := e.etcd.Create(path, id, ttl)
    if err != nil && !tools.IsEtcdNodeExist(err) {
        // unexpected error
        return "", err
    }
    if err != nil && tools.IsEtcdNodeExist(err) {
        return "", nil
    }
    return id, nil
}

// handleMaster performs one loop of master locking.
// on success it returns <master>, nil
// on error it returns "", err
// in situations where you should try again due to concurrent state changes (e.g. another actor simultaneously acquiring the lock)
// it returns "", nil
func (e *etcdMasterElector) handleMaster(path, id string, ttl uint64) (string, error) {
    res, err := e.etcd.Get(path, false, false)

    // Unexpected error, bail out
    if err != nil && !tools.IsEtcdNotFound(err) {
        return "", err
    }

    // There is no master, try to become the master.
    if err != nil && tools.IsEtcdNotFound(err) {
        return e.becomeMaster(path, id, ttl)
    }

    // This should never happen.
    if res.Node == nil {
        return "", fmt.Errorf("unexpected response: %#v", res)
    }

    // We're not the master, just return the current value
    if res.Node.Value != id {
        return res.Node.Value, nil
    }

    // We are the master, try to extend our lease
    return e.extendMaster(path, id, ttl, res)
}

// master provides a distributed master election lock; it maintains the lock until failure,
// or until someone sends something on the done channel.
// The basic algorithm is:
// while !done
//   Get the current master
//   If there is no current master
//     Try to become the master
//   Otherwise
//     If we are the master, extend the lease
//     If the master is different than the last time through the loop, report the master
//   Sleep 80% of TTL
func (e *etcdMasterElector) master(path, id string, ttl uint64, masters chan<- string, errors chan<- error, done <-chan empty) {
    lastMaster := ""
    for {
        master, err := e.handleMaster(path, id, ttl)
        if err != nil {
            errors <- err
        } else if len(master) == 0 {
            continue
        } else if master != lastMaster {
            lastMaster = master
            masters <- master
        }
        // TODO(k8s): Add Watch here, skip the polling for faster reactions
        // If done is closed, break out.
        select {
        case <-done:
            return
        case <-time.After(time.Duration((ttl*8)/10) * time.Second):
        }
    }
}

// Stop implements the watch.Interface interface.
func (e *etcdMasterElector) Stop() {
    close(e.done)
}
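A minimal sketch of driving the elector directly. It assumes a go-etcd client value that satisfies tools.EtcdGetSet (the fake client in the tests below satisfies the same interface); the endpoint, lock path, and id are illustrative.

package main

import (
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
    "github.com/coreos/go-etcd/etcd"
)

func main() {
    // assumption: *etcd.Client satisfies tools.EtcdGetSet
    client := etcd.NewClient([]string{"http://127.0.0.1:4001"})
    elector := election.NewEtcdMasterElector(client)

    w := elector.Elect("/election/k8sm", "scheduler-1") // illustrative path and id
    defer w.Stop()
    for event := range w.ResultChan() {
        if event.Type == watch.Modified {
            // each event announces the currently elected master id
            current := event.Object.(election.Master)
            println("current master:", string(current))
        }
    }
}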
98
contrib/mesos/pkg/election/etcd_master_test.go
Normal file
@@ -0,0 +1,98 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package election

import (
    "testing"

    "github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
    "github.com/coreos/go-etcd/etcd"
)

func TestEtcdMasterOther(t *testing.T) {
    path := "foo"
    etcd := tools.NewFakeEtcdClient(t)
    etcd.Set(path, "baz", 0)
    master := NewEtcdMasterElector(etcd)
    w := master.Elect(path, "bar")
    result := <-w.ResultChan()
    if result.Type != watch.Modified || result.Object.(Master) != "baz" {
        t.Errorf("unexpected event: %#v", result)
    }
    w.Stop()
}

func TestEtcdMasterNoOther(t *testing.T) {
    path := "foo"
    e := tools.NewFakeEtcdClient(t)
    e.TestIndex = true
    e.Data["foo"] = tools.EtcdResponseWithError{
        R: &etcd.Response{
            Node: nil,
        },
        E: &etcd.EtcdError{
            ErrorCode: tools.EtcdErrorCodeNotFound,
        },
    }
    master := NewEtcdMasterElector(e)
    w := master.Elect(path, "bar")
    result := <-w.ResultChan()
    if result.Type != watch.Modified || result.Object.(Master) != "bar" {
        t.Errorf("unexpected event: %#v", result)
    }
    w.Stop()
}

func TestEtcdMasterNoOtherThenConflict(t *testing.T) {
    path := "foo"
    e := tools.NewFakeEtcdClient(t)
    e.TestIndex = true
    // Ok, so we set up a chain of responses from etcd:
    // 1) Nothing there
    // 2) conflict (someone else wrote)
    // 3) new value (the data they wrote)
    empty := tools.EtcdResponseWithError{
        R: &etcd.Response{
            Node: nil,
        },
        E: &etcd.EtcdError{
            ErrorCode: tools.EtcdErrorCodeNotFound,
        },
    }
    empty.N = &tools.EtcdResponseWithError{
        R: &etcd.Response{},
        E: &etcd.EtcdError{
            ErrorCode: tools.EtcdErrorCodeNodeExist,
        },
    }
    empty.N.N = &tools.EtcdResponseWithError{
        R: &etcd.Response{
            Node: &etcd.Node{
                Value: "baz",
            },
        },
    }
    e.Data["foo"] = empty
    master := NewEtcdMasterElector(e)
    w := master.Elect(path, "bar")
    result := <-w.ResultChan()
    if result.Type != watch.Modified || result.Object.(Master) != "bar" {
        t.Errorf("unexpected event: %#v", result)
    }
    w.Stop()
}
53
contrib/mesos/pkg/election/fake.go
Normal file
@@ -0,0 +1,53 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package election

import (
    "sync"

    "github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
)

// Fake allows for testing of anything consuming a MasterElector.
type Fake struct {
    mux           *watch.Broadcaster
    currentMaster Master
    lock          sync.Mutex // protects access to currentMaster
}

// NewFake makes a new fake MasterElector.
func NewFake() *Fake {
    // 0 means block for clients.
    return &Fake{mux: watch.NewBroadcaster(0, watch.WaitIfChannelFull)}
}

func (f *Fake) ChangeMaster(newMaster Master) {
    f.lock.Lock()
    defer f.lock.Unlock()
    f.mux.Action(watch.Modified, newMaster)
    f.currentMaster = newMaster
}

func (f *Fake) Elect(path, id string) watch.Interface {
    f.lock.Lock()
    defer f.lock.Unlock()
    w := f.mux.Watch()
    if f.currentMaster != "" {
        f.mux.Action(watch.Modified, f.currentMaster)
    }
    return w
}
134
contrib/mesos/pkg/election/master.go
Normal file
@@ -0,0 +1,134 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package election

import (
    "sync"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/watch"

    "github.com/golang/glog"
)

// MasterElector is an interface for services that can elect masters.
// Important Note: MasterElectors are not inter-operable; all participants in the election need to be
// using the same underlying implementation of this interface for correct behavior.
type MasterElector interface {
    // Elect makes the caller represented by 'id' enter into a master election for the
    // distributed lock defined by 'path'.
    // The returned watch.Interface provides a stream of Master objects which
    // contain the current master.
    // Calling Stop on the returned interface relinquishes ownership (if currently possessed)
    // and removes the caller from the election.
    Elect(path, id string) watch.Interface
}

// Service represents anything that can start and stop on demand.
type Service interface {
    Validate(desired, current Master)
    Start()
    Stop()
}

type notifier struct {
    lock sync.Mutex
    cond *sync.Cond

    // desired is updated with every change, current is updated after
    // Start()/Stop() finishes. 'cond' is used to signal that a change
    // might be needed. This handles the case where mastership flops
    // around without calling Start()/Stop() excessively.
    desired, current Master

    // for comparison, to see if we are master.
    id Master

    service Service
}

// Notify runs Elect() on m, and calls Start()/Stop() on s when the
// elected master starts/stops matching 'id'. Never returns.
func Notify(m MasterElector, path, id string, s Service, abort <-chan struct{}) {
    n := &notifier{id: Master(id), service: s}
    n.cond = sync.NewCond(&n.lock)
    finished := runtime.After(func() {
        runtime.Until(func() {
            for {
                w := m.Elect(path, id)
            eventLoop:
                for {
                    select {
                    case <-abort:
                        return
                    case event, open := <-w.ResultChan():
                        if !open {
                            // the election watch closed: leave the inner loop
                            // and re-enter the election
                            break eventLoop
                        }
                        if event.Type != watch.Modified {
                            continue
                        }
                        electedMaster, ok := event.Object.(Master)
                        if !ok {
                            glog.Errorf("Unexpected object from election channel: %v", event.Object)
                            continue // skip the malformed event
                        }
                        func() {
                            n.lock.Lock()
                            defer n.lock.Unlock()
                            n.desired = electedMaster
                            if n.desired != n.current {
                                n.cond.Signal()
                            }
                        }()
                    }
                }
            }
        }, 0, abort)
    })
    runtime.Until(func() { n.serviceLoop(finished) }, 0, abort)
}

// serviceLoop waits for changes, and calls Start()/Stop() as needed.
func (n *notifier) serviceLoop(abort <-chan struct{}) {
    n.lock.Lock()
    defer n.lock.Unlock()
    for {
        select {
        case <-abort:
            return
        default:
            for n.desired == n.current {
                ch := runtime.After(n.cond.Wait)
                select {
                case <-abort:
                    n.cond.Signal() // ensure that Wait() returns
                    <-ch
                    return
                case <-ch:
                    // we were notified and have the lock, proceed..
                }
            }
            if n.current != n.id && n.desired == n.id {
                n.service.Validate(n.desired, n.current)
                n.service.Start()
            } else if n.current == n.id && n.desired != n.id {
                n.service.Stop()
            }
            n.current = n.desired
        }
    }
}
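A minimal sketch of wiring Notify to a Service, using the Fake elector from fake.go above; the Service implementation is a placeholder, and because Start()/Stop() run asynchronously a real caller would synchronize rather than rely on ordering.

package main

import (
    "fmt"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election"
)

// loggingService is a placeholder Service implementation.
type loggingService struct{}

func (loggingService) Validate(desired, current election.Master) {}
func (loggingService) Start()                                    { fmt.Println("became master: starting") }
func (loggingService) Stop()                                     { fmt.Println("lost mastership: stopping") }

func main() {
    elector := election.NewFake()
    abort := make(chan struct{})

    // Notify never returns, so run it on its own goroutine.
    go election.Notify(elector, "/election/example", "me", loggingService{}, abort)

    elector.ChangeMaster(election.Master("me"))    // eventually triggers Start()
    elector.ChangeMaster(election.Master("other")) // eventually triggers Stop()
    close(abort)
}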
98
contrib/mesos/pkg/election/master_test.go
Normal file
@@ -0,0 +1,98 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package election

import (
    "testing"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
)

type slowService struct {
    t  *testing.T
    on bool
    // We explicitly have no lock to prove that
    // Start and Stop are not called concurrently.
    changes chan<- bool
    done    <-chan struct{}
}

func (s *slowService) Validate(d, c Master) {
    // noop
}

func (s *slowService) Start() {
    select {
    case <-s.done:
        return // avoid writing to closed changes chan
    default:
    }
    if s.on {
        s.t.Errorf("started already-on service")
    }
    time.Sleep(2 * time.Millisecond)
    s.on = true
    s.changes <- true
}

func (s *slowService) Stop() {
    select {
    case <-s.done:
        return // avoid writing to closed changes chan
    default:
    }
    if !s.on {
        s.t.Errorf("stopped already-off service")
    }
    time.Sleep(2 * time.Millisecond)
    s.on = false
    s.changes <- false
}

func Test(t *testing.T) {
    m := NewFake()
    changes := make(chan bool, 1500)
    done := make(chan struct{})
    s := &slowService{t: t, changes: changes, done: done}
    notifyDone := runtime.After(func() { Notify(m, "", "me", s, done) })

    go func() {
        defer close(done)
        for i := 0; i < 500; i++ {
            for _, key := range []string{"me", "notme", "alsonotme"} {
                m.ChangeMaster(Master(key))
            }
        }
    }()

    <-notifyDone
    close(changes)

    changeList := []bool{}
    for {
        change, ok := <-changes
        if !ok {
            break
        }
        changeList = append(changeList, change)
    }

    if len(changeList) > 1000 {
        t.Errorf("unexpected number of changes: %v", len(changeList))
    }
}
29
contrib/mesos/pkg/executor/config/config.go
Normal file
@@ -0,0 +1,29 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package config

import (
    "time"
)

// default values to use when constructing mesos ExecutorInfo messages
const (
    DefaultInfoID         = "k8sm-executor"
    DefaultInfoSource     = "kubernetes"
    DefaultInfoName       = "Kubelet-Executor"
    DefaultSuicideTimeout = 20 * time.Minute
)
18
contrib/mesos/pkg/executor/config/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package config contains executor configuration constants.
package config
21
contrib/mesos/pkg/executor/doc.go
Normal file
@@ -0,0 +1,21 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
Package executor includes a Mesos executor, which embeds
a kubelet to manage containers.
*/
package executor
846
contrib/mesos/pkg/executor/executor.go
Normal file
846
contrib/mesos/pkg/executor/executor.go
Normal file
@@ -0,0 +1,846 @@
|
||||
/*
|
||||
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package executor
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
|
||||
"github.com/fsouza/go-dockerclient"
|
||||
"github.com/gogo/protobuf/proto"
|
||||
log "github.com/golang/glog"
|
||||
bindings "github.com/mesos/mesos-go/executor"
|
||||
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||
mutil "github.com/mesos/mesos-go/mesosutil"
|
||||
)
|
||||
|
||||
const (
|
||||
containerPollTime = 300 * time.Millisecond
|
||||
launchGracePeriod = 5 * time.Minute
|
||||
)
|
||||
|
||||
type stateType int32
|
||||
|
||||
const (
|
||||
disconnectedState stateType = iota
|
||||
connectedState
|
||||
suicidalState
|
||||
terminalState
|
||||
)
|
||||
|
||||
func (s *stateType) get() stateType {
|
||||
return stateType(atomic.LoadInt32((*int32)(s)))
|
||||
}
|
||||
|
||||
func (s *stateType) transition(from, to stateType) bool {
|
||||
return atomic.CompareAndSwapInt32((*int32)(s), int32(from), int32(to))
|
||||
}
|
||||
|
||||
func (s *stateType) transitionTo(to stateType, unless ...stateType) bool {
|
||||
if len(unless) == 0 {
|
||||
atomic.StoreInt32((*int32)(s), int32(to))
|
||||
return true
|
||||
}
|
||||
for {
|
||||
state := s.get()
|
||||
for _, x := range unless {
|
||||
if state == x {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if s.transition(state, to) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type kuberTask struct {
|
||||
mesosTaskInfo *mesos.TaskInfo
|
||||
podName string
|
||||
}
|
||||
|
||||
// func that attempts suicide
|
||||
type jumper func(bindings.ExecutorDriver, <-chan struct{})
|
||||
|
||||
type suicideWatcher interface {
|
||||
Next(time.Duration, bindings.ExecutorDriver, jumper) suicideWatcher
|
||||
Reset(time.Duration) bool
|
||||
Stop() bool
|
||||
}
|
||||
|
||||
type podStatusFunc func() (*api.PodStatus, error)
|
||||
|
||||
// KubernetesExecutor is an mesos executor that runs pods
|
||||
// in a minion machine.
|
||||
type KubernetesExecutor struct {
|
||||
kl *kubelet.Kubelet // the kubelet instance.
|
||||
updateChan chan<- interface{} // to send pod config updates to the kubelet
|
||||
state stateType
|
||||
tasks map[string]*kuberTask
|
||||
pods map[string]*api.Pod
|
||||
lock sync.RWMutex
|
||||
sourcename string
|
||||
client *client.Client
|
||||
events <-chan watch.Event
|
||||
done chan struct{} // signals shutdown
|
||||
outgoing chan func() (mesos.Status, error) // outgoing queue to the mesos driver
|
||||
dockerClient dockertools.DockerInterface
|
||||
suicideWatch suicideWatcher
|
||||
suicideTimeout time.Duration
|
||||
shutdownAlert func() // invoked just prior to executor shutdown
|
||||
kubeletFinished <-chan struct{} // signals that kubelet Run() died
|
||||
initialRegistration sync.Once
|
||||
exitFunc func(int)
|
||||
podStatusFunc func(*kubelet.Kubelet, *api.Pod) (*api.PodStatus, error)
|
||||
}
|
||||
|
||||
type Config struct {
|
||||
Kubelet *kubelet.Kubelet
|
||||
Updates chan<- interface{} // to send pod config updates to the kubelet
|
||||
SourceName string
|
||||
APIClient *client.Client
|
||||
Watch watch.Interface
|
||||
Docker dockertools.DockerInterface
|
||||
ShutdownAlert func()
|
||||
SuicideTimeout time.Duration
|
||||
KubeletFinished <-chan struct{} // signals that kubelet Run() died
|
||||
ExitFunc func(int)
|
||||
PodStatusFunc func(*kubelet.Kubelet, *api.Pod) (*api.PodStatus, error)
|
||||
}
|
||||
|
||||
func (k *KubernetesExecutor) isConnected() bool {
|
||||
return connectedState == (&k.state).get()
|
||||
}
|
||||
|
||||
// New creates a new kubernetes executor.
|
||||
func New(config Config) *KubernetesExecutor {
|
||||
k := &KubernetesExecutor{
|
||||
kl: config.Kubelet,
|
||||
updateChan: config.Updates,
|
||||
state: disconnectedState,
|
||||
tasks: make(map[string]*kuberTask),
|
||||
pods: make(map[string]*api.Pod),
|
||||
sourcename: config.SourceName,
|
||||
client: config.APIClient,
|
||||
done: make(chan struct{}),
|
||||
outgoing: make(chan func() (mesos.Status, error), 1024),
|
||||
dockerClient: config.Docker,
|
||||
suicideTimeout: config.SuicideTimeout,
|
||||
kubeletFinished: config.KubeletFinished,
|
||||
suicideWatch: &suicideTimer{},
|
||||
shutdownAlert: config.ShutdownAlert,
|
||||
exitFunc: config.ExitFunc,
|
||||
podStatusFunc: config.PodStatusFunc,
|
||||
}
|
||||
//TODO(jdef) do something real with these events..
|
||||
if config.Watch != nil {
|
||||
events := config.Watch.ResultChan()
|
||||
if events != nil {
|
||||
go func() {
|
||||
for e := range events {
|
||||
// e ~= watch.Event { ADDED, *api.Event }
|
||||
log.V(1).Info(e)
|
||||
}
|
||||
}()
|
||||
k.events = events
|
||||
}
|
||||
}
|
||||
return k
|
||||
}
|
||||
|
||||
func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
|
||||
k.killKubeletContainers()
|
||||
k.resetSuicideWatch(driver)
|
||||
go k.sendLoop()
|
||||
//TODO(jdef) monitor kubeletFinished and shutdown if it happens
|
||||
}
|
||||
|
||||
func (k *KubernetesExecutor) Done() <-chan struct{} {
|
||||
return k.done
|
||||
}
|
||||
|
||||
func (k *KubernetesExecutor) isDone() bool {
|
||||
select {
|
||||
case <-k.done:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Registered is called when the executor is successfully registered with the slave.
|
||||
func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,
|
||||
executorInfo *mesos.ExecutorInfo, frameworkInfo *mesos.FrameworkInfo, slaveInfo *mesos.SlaveInfo) {
|
||||
if k.isDone() {
|
||||
return
|
||||
}
|
||||
log.Infof("Executor %v of framework %v registered with slave %v\n",
|
||||
executorInfo, frameworkInfo, slaveInfo)
|
||||
if !(&k.state).transition(disconnectedState, connectedState) {
|
||||
log.Errorf("failed to register/transition to a connected state")
|
||||
}
|
||||
k.initialRegistration.Do(k.onInitialRegistration)
|
||||
}
|
||||
|
||||
// Reregistered is called when the executor is successfully re-registered with the slave.
|
||||
// This can happen when the slave fails over.
|
||||
func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
|
||||
if k.isDone() {
|
||||
return
|
||||
}
|
||||
log.Infof("Reregistered with slave %v\n", slaveInfo)
|
||||
if !(&k.state).transition(disconnectedState, connectedState) {
|
||||
log.Errorf("failed to reregister/transition to a connected state")
|
||||
}
|
||||
k.initialRegistration.Do(k.onInitialRegistration)
|
||||
}
|
||||
|
||||
func (k *KubernetesExecutor) onInitialRegistration() {
|
||||
// emit an empty update to allow the mesos "source" to be marked as seen
|
||||
k.updateChan <- kubelet.PodUpdate{
|
||||
Pods: []*api.Pod{},
|
||||
Op: kubelet.SET,
|
||||
Source: k.sourcename,
|
||||
}
|
||||
}
|
||||
|
||||
// Disconnected is called when the executor is disconnected from the slave.
|
||||
func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
|
||||
if k.isDone() {
|
||||
return
|
||||
}
|
||||
log.Infof("Slave is disconnected\n")
|
||||
if !(&k.state).transition(connectedState, disconnectedState) {
|
||||
log.Errorf("failed to disconnect/transition to a disconnected state")
|
||||
}
|
||||
}
|
||||
|
||||
// LaunchTask is called when the executor receives a request to launch a task.
|
||||
// The happens when the k8sm scheduler has decided to schedule the pod
|
||||
// (which corresponds to a Mesos Task) onto the node where this executor
|
||||
// is running, but the binding is not recorded in the Kubernetes store yet.
|
||||
// This function is invoked to tell the executor to record the binding in the
|
||||
// Kubernetes store and start the pod via the Kubelet.
|
||||
func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
|
||||
if k.isDone() {
|
||||
return
|
||||
}
|
||||
log.Infof("Launch task %v\n", taskInfo)
|
||||
|
||||
if !k.isConnected() {
|
||||
log.Errorf("Ignore launch task because the executor is disconnected\n")
|
||||
k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
|
||||
messages.ExecutorUnregistered))
|
||||
return
|
||||
}
|
||||
|
||||
obj, err := api.Codec.Decode(taskInfo.GetData())
|
||||
if err != nil {
|
||||
log.Errorf("failed to extract yaml data from the taskInfo.data %v", err)
|
||||
k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
|
||||
messages.UnmarshalTaskDataFailure))
|
||||
return
|
||||
}
|
||||
pod, ok := obj.(*api.Pod)
|
||||
if !ok {
|
||||
log.Errorf("expected *api.Pod instead of %T: %+v", pod, pod)
|
||||
k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
|
||||
messages.UnmarshalTaskDataFailure))
|
||||
return
|
||||
}
|
||||
|
||||
k.lock.Lock()
|
||||
defer k.lock.Unlock()
|
||||
|
||||
taskId := taskInfo.GetTaskId().GetValue()
|
||||
if _, found := k.tasks[taskId]; found {
|
||||
log.Errorf("task already launched\n")
|
||||
// Not to send back TASK_RUNNING here, because
|
||||
// may be duplicated messages or duplicated task id.
|
||||
return
|
||||
}
|
||||
// remember this task so that:
|
||||
// (a) we ignore future launches for it
|
||||
// (b) we have a record of it so that we can kill it if needed
|
||||
// (c) we're leaving podName == "" for now, indicates we don't need to delete containers
|
||||
k.tasks[taskId] = &kuberTask{
|
||||
mesosTaskInfo: taskInfo,
|
||||
}
|
||||
k.resetSuicideWatch(driver)
|
||||
|
||||
go k.launchTask(driver, taskId, pod)
|
||||
}
|
||||
|
||||
// TODO(jdef) add metrics for this?
|
||||
type suicideTimer struct {
|
||||
timer *time.Timer
|
||||
}
|
||||
|
||||
func (w *suicideTimer) Next(d time.Duration, driver bindings.ExecutorDriver, f jumper) suicideWatcher {
|
||||
return &suicideTimer{
|
||||
timer: time.AfterFunc(d, func() {
|
||||
log.Warningf("Suicide timeout (%v) expired", d)
|
||||
f(driver, nil)
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
func (w *suicideTimer) Stop() (result bool) {
|
||||
if w != nil && w.timer != nil {
|
||||
log.Infoln("stopping suicide watch") //TODO(jdef) debug
|
||||
result = w.timer.Stop()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// return true if the timer was successfully reset
|
||||
func (w *suicideTimer) Reset(d time.Duration) bool {
|
||||
if w != nil && w.timer != nil {
|
||||
log.Infoln("resetting suicide watch") //TODO(jdef) debug
|
||||
w.timer.Reset(d)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// determine whether we need to start a suicide countdown. if so, then start
|
||||
// a timer that, upon expiration, causes this executor to commit suicide.
|
||||
// this implementation runs asynchronously. callers that wish to wait for the
|
||||
// reset to complete may wait for the returned signal chan to close.
|
||||
func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
|
||||
ch := make(chan struct{})
|
||||
go func() {
|
||||
defer close(ch)
|
||||
k.lock.Lock()
|
||||
defer k.lock.Unlock()
|
||||
|
||||
if k.suicideTimeout < 1 {
|
||||
return
|
||||
}
|
||||
|
||||
if k.suicideWatch != nil {
|
||||
if len(k.tasks) > 0 {
|
||||
k.suicideWatch.Stop()
|
||||
return
|
||||
}
|
||||
if k.suicideWatch.Reset(k.suicideTimeout) {
|
||||
// valid timer, reset was successful
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
//TODO(jdef) reduce verbosity here once we're convinced that suicide watch is working properly
|
||||
log.Infof("resetting suicide watch timer for %v", k.suicideTimeout)
|
||||
|
||||
k.suicideWatch = k.suicideWatch.Next(k.suicideTimeout, driver, jumper(k.attemptSuicide))
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
|
||||
k.lock.Lock()
|
||||
defer k.lock.Unlock()
|
||||
|
||||
// this attempt may have been queued and since been aborted
|
||||
select {
|
||||
case <-abort:
|
||||
//TODO(jdef) reduce verbosity once suicide watch is working properly
|
||||
log.Infof("aborting suicide attempt since watch was cancelled")
|
||||
return
|
||||
default: // continue
|
||||
}
|
||||
|
||||
// fail-safe, will abort kamikaze attempts if there are tasks
|
||||
if len(k.tasks) > 0 {
|
||||
ids := []string{}
|
||||
for taskid := range k.tasks {
|
||||
ids = append(ids, taskid)
|
||||
}
|
||||
log.Errorf("suicide attempt failed, there are still running tasks: %v", ids)
|
||||
return
|
||||
}
|
||||
|
||||
log.Infoln("Attempting suicide")
|
||||
if (&k.state).transitionTo(suicidalState, suicidalState, terminalState) {
|
||||
//TODO(jdef) let the scheduler know?
|
||||
//TODO(jdef) is suicide more graceful than slave-demanded shutdown?
|
||||
k.doShutdown(driver)
|
||||
}
|
||||
}
|
||||

// async continuation of LaunchTask
func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {

	//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/scheduler.go
	binding := &api.Binding{
		ObjectMeta: api.ObjectMeta{
			Namespace:   pod.Namespace,
			Name:        pod.Name,
			Annotations: make(map[string]string),
		},
		Target: api.ObjectReference{
			Kind: "Node",
			Name: pod.Annotations[meta.BindingHostKey],
		},
	}

	// forward the annotations that the scheduler wants to apply
	for k, v := range pod.Annotations {
		binding.Annotations[k] = v
	}

	deleteTask := func() {
		k.lock.Lock()
		defer k.lock.Unlock()
		delete(k.tasks, taskId)
		k.resetSuicideWatch(driver)
	}

	log.Infof("Binding '%v/%v' to '%v' with annotations %+v...", pod.Namespace, pod.Name, binding.Target.Name, binding.Annotations)
	ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
	// TODO(k8s): use Pods interface for binding once clusters are upgraded
	// return b.Pods(binding.Namespace).Bind(binding)
	err := k.client.Post().Namespace(api.NamespaceValue(ctx)).Resource("bindings").Body(binding).Do().Error()
	if err != nil {
		deleteTask()
		k.sendStatus(driver, newStatus(mutil.NewTaskID(taskId), mesos.TaskState_TASK_FAILED,
			messages.CreateBindingFailure))
		return
	}
	podFullName := container.GetPodFullName(pod)

	// allow a recently failed-over scheduler the chance to recover the task/pod binding:
	// it may have failed and recovered before the apiserver is able to report the updated
	// binding information. replays of this status event will signal to the scheduler that
	// the apiserver should be up-to-date.
	data, err := json.Marshal(api.PodStatusResult{
		ObjectMeta: api.ObjectMeta{
			Name:     podFullName,
			SelfLink: "/podstatusresult",
		},
	})
	if err != nil {
		deleteTask()
		log.Errorf("failed to marshal pod status result: %v", err)
		k.sendStatus(driver, newStatus(mutil.NewTaskID(taskId), mesos.TaskState_TASK_FAILED,
			err.Error()))
		return
	}

	k.lock.Lock()
	defer k.lock.Unlock()

	// Add the task.
	task, found := k.tasks[taskId]
	if !found {
		log.V(1).Infof("task %v not found, probably killed: aborting launch, reporting lost", taskId)
		k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
		return
	}

	//TODO(jdef) check for duplicate pod name, if found send TASK_ERROR

	// from here on, we need to delete containers associated with the task
	// upon it going into a terminal state
	task.podName = podFullName
	k.pods[podFullName] = pod

	// send the latest snapshot of the set of pods to the kubelet via the pod update channel
	update := kubelet.PodUpdate{Op: kubelet.SET}
	for _, p := range k.pods {
		update.Pods = append(update.Pods, p)
	}
	k.updateChan <- update

	statusUpdate := &mesos.TaskStatus{
		TaskId:  mutil.NewTaskID(taskId),
		State:   mesos.TaskState_TASK_STARTING.Enum(),
		Message: proto.String(messages.CreateBindingSuccess),
		Data:    data,
	}
	k.sendStatus(driver, statusUpdate)

	// Delay reporting 'task running' until container is up.
	psf := podStatusFunc(func() (*api.PodStatus, error) {
		return k.podStatusFunc(k.kl, pod)
	})

	go k._launchTask(driver, taskId, podFullName, psf)
}
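// NOTE: launchTask reports TASK_STARTING only after the apiserver binding
// succeeds; the TASK_RUNNING transition is deferred to _launchTask below,
// which polls the pod status until the kubelet reports api.PodRunning or
// launchGracePeriod expires.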

func (k *KubernetesExecutor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {

	expired := make(chan struct{})
	time.AfterFunc(launchGracePeriod, func() { close(expired) })

	getMarshalledInfo := func() (data []byte, cancel bool) {
		// potentially long call..
		if podStatus, err := psf(); err == nil && podStatus != nil {
			select {
			case <-expired:
				cancel = true
			default:
				k.lock.Lock()
				defer k.lock.Unlock()
				if _, found := k.tasks[taskId]; !found {
					// don't bother with the pod status if the task is already gone
					cancel = true
					break
				} else if podStatus.Phase != api.PodRunning {
					// avoid sending back a running status before it's really running
					break
				}
				log.V(2).Infof("Found pod status: '%v'", podStatus)
				result := api.PodStatusResult{
					ObjectMeta: api.ObjectMeta{
						Name:     podFullName,
						SelfLink: "/podstatusresult",
					},
					Status: *podStatus,
				}
				if data, err = json.Marshal(result); err != nil {
					log.Errorf("failed to marshal pod status result: %v", err)
				}
			}
		}
		return
	}

waitForRunningPod:
	for {
		select {
		case <-expired:
			log.Warningf("launch grace period of '%v' expired", launchGracePeriod)
			break waitForRunningPod
		case <-time.After(containerPollTime):
			if data, cancel := getMarshalledInfo(); cancel {
				break waitForRunningPod
			} else if data == nil {
				continue waitForRunningPod
			} else {
				k.lock.Lock()
				defer k.lock.Unlock()
				if _, found := k.tasks[taskId]; !found {
					goto reportLost
				}

				statusUpdate := &mesos.TaskStatus{
					TaskId:  mutil.NewTaskID(taskId),
					State:   mesos.TaskState_TASK_RUNNING.Enum(),
					Message: proto.String(fmt.Sprintf("pod-running:%s", podFullName)),
					Data:    data,
				}

				k.sendStatus(driver, statusUpdate)

				// continue to monitor the health of the pod
				go k.__launchTask(driver, taskId, podFullName, psf)
				return
			}
		}
	}

	k.lock.Lock()
	defer k.lock.Unlock()
reportLost:
	k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
}
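// NOTE: in the wait loop above the executor lock is acquired at most once:
// every branch that locks either returns or jumps to reportLost, so the
// deferred Unlock never stacks across iterations, and the goto deliberately
// skips the second Lock to avoid self-deadlock.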

func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
	// TODO(nnielsen): Monitor health of pod and report if lost.
	// Should we also allow this to fail a couple of times before reporting lost?
	// What if the docker daemon is restarting and we can't connect, but it's
	// going to bring the pods back online as soon as it restarts?
	knownPod := func() bool {
		_, err := psf()
		return err == nil
	}
	// Wait for the pod to go away and stop monitoring once it does
	// TODO (jdefelice) replace with an /events watch?
	for {
		time.Sleep(containerPollTime)
		if k.checkForLostPodTask(driver, taskId, knownPod) {
			return
		}
	}
}

// Intended to be executed as part of the pod monitoring loop, this fn (ultimately) checks with Docker
// whether the pod is running. It will only return false if the task is still registered and the pod is
// registered in Docker. Otherwise it returns true. If there's still a task record on file, but no pod
// in Docker, then we'll also send a TASK_LOST event.
func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
	// TODO (jdefelice) don't send false alarms for deleted pods (KILLED tasks)
	k.lock.Lock()
	defer k.lock.Unlock()

	// TODO(jdef) we should really consider k.pods here, along with what docker is reporting, since the
	// kubelet may constantly attempt to instantiate a pod as long as it's in the pod state that we're
	// handing to it. otherwise, we're probably reporting a TASK_LOST prematurely. Should probably
	// consult RestartPolicy to determine appropriate behavior. Should probably also gracefully handle
	// docker daemon restarts.
	if _, ok := k.tasks[taskId]; ok {
		if isKnownPod() {
			return false
		} else {
			log.Warningf("Detected lost pod, reporting lost task %v", taskId)
			k.reportLostTask(driver, taskId, messages.ContainersDisappeared)
		}
	} else {
		log.V(2).Infof("Task %v no longer registered, stop monitoring for lost pods", taskId)
	}
	return true
}

// KillTask is called when the executor receives a request to kill a task.
func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
	if k.isDone() {
		return
	}
	log.Infof("Kill task %v\n", taskId)

	if !k.isConnected() {
		//TODO(jdefelice) send TASK_LOST here?
		log.Warningf("Ignoring kill task because the executor is disconnected\n")
		return
	}

	k.lock.Lock()
	defer k.lock.Unlock()
	k.removePodTask(driver, taskId.GetValue(), messages.TaskKilled, mesos.TaskState_TASK_KILLED)
}

// reportLostTask reports a lost task to the slave and updates internal task and pod tracking state.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
	k.removePodTask(driver, tid, reason, mesos.TaskState_TASK_LOST)
}

// removePodTask deletes the pod and task associated with the task identified by tid and sends a task
// status update to mesos. It also attempts to reset the suicide watch.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
	task, ok := k.tasks[tid]
	if !ok {
		log.V(1).Infof("Failed to remove task, unknown task %v\n", tid)
		return
	}
	delete(k.tasks, tid)
	k.resetSuicideWatch(driver)

	pid := task.podName
	if _, found := k.pods[pid]; !found {
		log.Warningf("Cannot remove unknown pod %v for task %v", pid, tid)
	} else {
		log.V(2).Infof("deleting pod %v for task %v", pid, tid)
		delete(k.pods, pid)

		// Send the pod updates to the channel.
		update := kubelet.PodUpdate{Op: kubelet.SET}
		for _, p := range k.pods {
			update.Pods = append(update.Pods, p)
		}
		k.updateChan <- update
	}
	// TODO(jdef): ensure that the update propagates, perhaps return a signal chan?
	k.sendStatus(driver, newStatus(mutil.NewTaskID(tid), state, reason))
}
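// NOTE: removePodTask (above) and launchTask always push complete
// kubelet.SET snapshots rather than incremental ADD/REMOVE ops, so a missed
// update cannot leave the kubelet with stale pod state; the next snapshot
// fully replaces whatever it has.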

// FrameworkMessage is called when the framework sends some message to the executor
func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
	if k.isDone() {
		return
	}
	if !k.isConnected() {
		log.Warningf("Ignoring framework message because the executor is disconnected\n")
		return
	}

	log.Infof("Received message from framework %v\n", message)
	//TODO(jdef) master reported a lost task, reconcile this! @see scheduler.go:handleTaskLost
	if strings.HasPrefix(message, "task-lost:") && len(message) > 10 {
		taskId := message[10:]
		if taskId != "" {
			// clean up pod state
			k.lock.Lock()
			defer k.lock.Unlock()
			k.reportLostTask(driver, taskId, messages.TaskLostAck)
		}
	}

	switch message {
	case messages.Kamikaze:
		k.attemptSuicide(driver, nil)
	}
}
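// NOTE: a scheduler-side lost-task notification arrives as a plain framework
// message of the form "task-lost:"+taskId (the taskId value is whatever the
// scheduler assigned), and the executor acknowledges it by cleaning up and
// sending a TASK_LOST status update tagged messages.TaskLostAck.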

// Shutdown is called when the executor receives a shutdown request.
func (k *KubernetesExecutor) Shutdown(driver bindings.ExecutorDriver) {
	k.lock.Lock()
	defer k.lock.Unlock()
	k.doShutdown(driver)
}

// doShutdown assumes that the caller has obtained the state lock.
func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
	defer func() {
		log.Errorf("exiting with unclean shutdown: %v", recover())
		if k.exitFunc != nil {
			k.exitFunc(1)
		}
	}()

	(&k.state).transitionTo(terminalState)

	// signal to all listeners that this KubeletExecutor is done!
	close(k.done)

	if k.shutdownAlert != nil {
		func() {
			defer util.HandleCrash()
			k.shutdownAlert()
		}()
	}

	log.Infoln("Stopping executor driver")
	_, err := driver.Stop()
	if err != nil {
		log.Warningf("failed to stop executor driver: %v", err)
	}

	log.Infoln("Shutting down the executor")

	// according to the docs, mesos will generate TASK_LOST updates for us
	// if needed, so don't take extra time to do that here.
	k.tasks = map[string]*kuberTask{}

	select {
	// the main Run() func may still be running... wait for it to finish: it will
	// clear the pod configuration cleanly, telling k8s "there are no pods" and
	// clean up resources (pods, volumes, etc).
	case <-k.kubeletFinished:

	//TODO(jdef) attempt to wait for events to propagate to the API server?

	// TODO(jdef) extract constant, should be smaller than whatever the
	// slave graceful shutdown timeout period is.
	case <-time.After(15 * time.Second):
		log.Errorf("timed out waiting for kubelet Run() to die")
	}

	log.Infoln("exiting")
	if k.exitFunc != nil {
		k.exitFunc(0)
	}
}
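// NOTE: doShutdown (above) tears down in a fixed order: mark the state
// terminal, close k.done (which stops sendLoop and the status senders), fire
// the shutdown alert, stop the driver, drop all task records, then wait at
// most 15s for the kubelet's Run() to drain pod config before exiting. The
// deferred recover() turns any panic along the way into a non-zero exit.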

// killKubeletContainers destroys existing k8s containers
func (k *KubernetesExecutor) killKubeletContainers() {
	if containers, err := dockertools.GetKubeletDockerContainers(k.dockerClient, true); err == nil {
		opts := docker.RemoveContainerOptions{
			RemoveVolumes: true,
			Force:         true,
		}
		for _, container := range containers {
			opts.ID = container.ID
			log.V(2).Infof("Removing container: %v", opts.ID)
			if err := k.dockerClient.RemoveContainer(opts); err != nil {
				log.Warning(err)
			}
		}
	} else {
		log.Warningf("Failed to list kubelet docker containers: %v", err)
	}
}

// Error is called when some error happens.
func (k *KubernetesExecutor) Error(driver bindings.ExecutorDriver, message string) {
	log.Errorln(message)
}

func newStatus(taskId *mesos.TaskID, state mesos.TaskState, message string) *mesos.TaskStatus {
	return &mesos.TaskStatus{
		TaskId:  taskId,
		State:   &state,
		Message: proto.String(message),
	}
}

func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
	select {
	case <-k.done:
	default:
		k.outgoing <- func() (mesos.Status, error) { return driver.SendStatusUpdate(status) }
	}
}

func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
	select {
	case <-k.done:
	default:
		k.outgoing <- func() (mesos.Status, error) { return driver.SendFrameworkMessage(msg) }
	}
}

func (k *KubernetesExecutor) sendLoop() {
	defer log.V(1).Info("sender loop exiting")
	for {
		select {
		case <-k.done:
			return
		default:
			if !k.isConnected() {
				select {
				case <-k.done:
				case <-time.After(1 * time.Second):
				}
				continue
			}
			sender, ok := <-k.outgoing
			if !ok {
				// programming error
				panic("someone closed the outgoing channel")
			}
			if status, err := sender(); err == nil {
				continue
			} else {
				log.Error(err)
				if status == mesos.Status_DRIVER_ABORTED {
					return
				}
			}
			// attempt to re-queue the sender
			select {
			case <-k.done:
			case k.outgoing <- sender:
			}
		}
	}
}
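// NOTE: the outgoing channel decouples producing a status update from
// delivering it; senders enqueue closures, for example:
//
//	k.outgoing <- func() (mesos.Status, error) {
//		return driver.SendStatusUpdate(status)
//	}
//
// and sendLoop drains the queue only while connected, re-queueing a sender
// after a transient error so updates are not silently dropped.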
618
contrib/mesos/pkg/executor/executor_test.go
Normal file
@@ -0,0 +1,618 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package executor

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"reflect"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	assertext "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/assert"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
	kmruntime "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/testapi"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"

	"github.com/golang/glog"
	bindings "github.com/mesos/mesos-go/executor"
	"github.com/mesos/mesos-go/mesosproto"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
)

type suicideTracker struct {
	suicideWatcher
	stops  uint32
	resets uint32
	timers uint32
	jumps  *uint32
}

func (t *suicideTracker) Reset(d time.Duration) bool {
	defer func() { t.resets++ }()
	return t.suicideWatcher.Reset(d)
}

func (t *suicideTracker) Stop() bool {
	defer func() { t.stops++ }()
	return t.suicideWatcher.Stop()
}

func (t *suicideTracker) Next(d time.Duration, driver bindings.ExecutorDriver, f jumper) suicideWatcher {
	tracker := &suicideTracker{
		stops:  t.stops,
		resets: t.resets,
		jumps:  t.jumps,
		timers: t.timers + 1,
	}
	jumper := tracker.makeJumper(f)
	tracker.suicideWatcher = t.suicideWatcher.Next(d, driver, jumper)
	return tracker
}

func (t *suicideTracker) makeJumper(_ jumper) jumper {
	return jumper(func(driver bindings.ExecutorDriver, cancel <-chan struct{}) {
		glog.Warningln("jumping?!")
		if t.jumps != nil {
			atomic.AddUint32(t.jumps, 1)
		}
	})
}
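// NOTE: suicideTracker decorates the real suicideWatcher for testing: it
// counts Stop/Reset calls and timers created, and makeJumper swaps the real
// jumper out so a firing timer only bumps an atomic counter instead of
// triggering attemptSuicide; the tests below assert against those counters.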

func TestSuicide_zeroTimeout(t *testing.T) {
	defer glog.Flush()

	k := New(Config{})
	tracker := &suicideTracker{suicideWatcher: k.suicideWatch}
	k.suicideWatch = tracker

	ch := k.resetSuicideWatch(nil)

	select {
	case <-ch:
	case <-time.After(2 * time.Second):
		t.Fatalf("timeout waiting for reset of suicide watch")
	}
	if tracker.stops != 0 {
		t.Fatalf("expected no stops since suicideTimeout was never set")
	}
	if tracker.resets != 0 {
		t.Fatalf("expected no resets since suicideTimeout was never set")
	}
	if tracker.timers != 0 {
		t.Fatalf("expected no timers since suicideTimeout was never set")
	}
}

func TestSuicide_WithTasks(t *testing.T) {
	defer glog.Flush()

	k := New(Config{
		SuicideTimeout: 50 * time.Millisecond,
	})

	jumps := uint32(0)
	tracker := &suicideTracker{suicideWatcher: k.suicideWatch, jumps: &jumps}
	k.suicideWatch = tracker

	k.tasks["foo"] = &kuberTask{} // prevent suicide attempts from succeeding

	// call reset with a nil timer
	glog.Infoln("resetting suicide watch with 1 task")
	select {
	case <-k.resetSuicideWatch(nil):
		tracker = k.suicideWatch.(*suicideTracker)
		if tracker.stops != 1 {
			t.Fatalf("expected suicide attempt to Stop() since there are registered tasks")
		}
		if tracker.resets != 0 {
			t.Fatalf("expected no resets since the watch was only stopped")
		}
		if tracker.timers != 0 {
			t.Fatalf("expected no timers since no new timer should have been created")
		}
	case <-time.After(1 * time.Second):
		t.Fatalf("initial suicide watch setup failed")
	}

	delete(k.tasks, "foo") // zero remaining tasks
	k.suicideTimeout = 1500 * time.Millisecond
	suicideStart := time.Now()

	// reset the suicide watch, which should actually start a timer now
	glog.Infoln("resetting suicide watch with 0 tasks")
	select {
	case <-k.resetSuicideWatch(nil):
		tracker = k.suicideWatch.(*suicideTracker)
		if tracker.stops != 1 {
			t.Fatalf("did not expect suicide attempt to Stop() since there are no registered tasks")
		}
		if tracker.resets != 1 {
			t.Fatalf("expected 1 reset instead of %d", tracker.resets)
		}
		if tracker.timers != 1 {
			t.Fatalf("expected 1 timer instead of %d", tracker.timers)
		}
	case <-time.After(1 * time.Second):
		t.Fatalf("2nd suicide watch setup failed")
	}

	k.lock.Lock()
	k.tasks["foo"] = &kuberTask{} // prevent suicide attempts from succeeding
	k.lock.Unlock()

	// reset the suicide watch, which should stop the existing timer
	glog.Infoln("resetting suicide watch with 1 task")
	select {
	case <-k.resetSuicideWatch(nil):
		tracker = k.suicideWatch.(*suicideTracker)
		if tracker.stops != 2 {
			t.Fatalf("expected 2 stops instead of %d since there are registered tasks", tracker.stops)
		}
		if tracker.resets != 1 {
			t.Fatalf("expected 1 reset instead of %d", tracker.resets)
		}
		if tracker.timers != 1 {
			t.Fatalf("expected 1 timer instead of %d", tracker.timers)
		}
	case <-time.After(1 * time.Second):
		t.Fatalf("3rd suicide watch setup failed")
	}

	k.lock.Lock()
	delete(k.tasks, "foo") // allow suicide attempts to schedule
	k.lock.Unlock()

	// reset the suicide watch, which should reset a stopped timer
	glog.Infoln("resetting suicide watch with 0 tasks")
	select {
	case <-k.resetSuicideWatch(nil):
		tracker = k.suicideWatch.(*suicideTracker)
		if tracker.stops != 2 {
			t.Fatalf("expected 2 stops instead of %d since there are no registered tasks", tracker.stops)
		}
		if tracker.resets != 2 {
			t.Fatalf("expected 2 resets instead of %d", tracker.resets)
		}
		if tracker.timers != 1 {
			t.Fatalf("expected 1 timer instead of %d", tracker.timers)
		}
	case <-time.After(1 * time.Second):
		t.Fatalf("4th suicide watch setup failed")
	}

	sinceWatch := time.Since(suicideStart)
	time.Sleep(3*time.Second - sinceWatch) // give the first timer a chance to misfire (it shouldn't, since Stop() was called)

	if j := atomic.LoadUint32(&jumps); j != 1 {
		t.Fatalf("expected 1 jump instead of %d since stop was called", j)
	} else {
		glog.Infoln("jumps verified") // glog so we get a timestamp
	}
}

// TestExecutorRegister ensures that the executor thinks it is connected
// after Register is called.
func TestExecutorRegister(t *testing.T) {
	mockDriver := &MockExecutorDriver{}
	updates := make(chan interface{}, 1024)
	executor := New(Config{
		Docker:     dockertools.ConnectToDockerOrDie("fake://"),
		Updates:    updates,
		SourceName: "executor_test",
	})

	executor.Init(mockDriver)
	executor.Registered(mockDriver, nil, nil, nil)

	initialPodUpdate := kubelet.PodUpdate{
		Pods:   []*api.Pod{},
		Op:     kubelet.SET,
		Source: executor.sourcename,
	}
	receivedInitialPodUpdate := false
	select {
	case m := <-updates:
		update, ok := m.(kubelet.PodUpdate)
		if ok {
			if reflect.DeepEqual(initialPodUpdate, update) {
				receivedInitialPodUpdate = true
			}
		}
	case <-time.After(time.Second):
	}
	assert.Equal(t, true, receivedInitialPodUpdate,
		"executor should have sent an initial PodUpdate "+
			"to the updates chan upon registration")

	assert.Equal(t, true, executor.isConnected(), "executor should be connected")
	mockDriver.AssertExpectations(t)
}

// TestExecutorDisconnect ensures that the executor thinks that it is not
// connected after a call to Disconnected has occurred.
func TestExecutorDisconnect(t *testing.T) {
	mockDriver := &MockExecutorDriver{}
	executor := NewTestKubernetesExecutor()

	executor.Init(mockDriver)
	executor.Registered(mockDriver, nil, nil, nil)
	executor.Disconnected(mockDriver)

	assert.Equal(t, false, executor.isConnected(),
		"executor should not be connected after Disconnected")
	mockDriver.AssertExpectations(t)
}

// TestExecutorReregister ensures that the executor thinks it is connected
// after a connection problem happens, followed by a call to Reregistered.
func TestExecutorReregister(t *testing.T) {
	mockDriver := &MockExecutorDriver{}
	executor := NewTestKubernetesExecutor()

	executor.Init(mockDriver)
	executor.Registered(mockDriver, nil, nil, nil)
	executor.Disconnected(mockDriver)
	executor.Reregistered(mockDriver, nil)

	assert.Equal(t, true, executor.isConnected(), "executor should be connected")
	mockDriver.AssertExpectations(t)
}

// TestExecutorLaunchAndKillTask ensures that the executor is able to launch
// and kill tasks while properly bookkeeping its tasks.
func TestExecutorLaunchAndKillTask(t *testing.T) {
	// create a fake pod watch; it backs the fake apiserver created below
	podListWatch := NewMockPodsListWatch(api.PodList{})

	// create fake apiserver
	testApiServer := NewTestServer(t, api.NamespaceDefault, &podListWatch.list)
	defer testApiServer.server.Close()

	mockDriver := &MockExecutorDriver{}
	updates := make(chan interface{}, 1024)
	config := Config{
		Docker:  dockertools.ConnectToDockerOrDie("fake://"),
		Updates: updates,
		APIClient: client.NewOrDie(&client.Config{
			Host:    testApiServer.server.URL,
			Version: testapi.Version(),
		}),
		Kubelet: &kubelet.Kubelet{},
		PodStatusFunc: func(kl *kubelet.Kubelet, pod *api.Pod) (*api.PodStatus, error) {
			return &api.PodStatus{
				ContainerStatuses: []api.ContainerStatus{
					{
						Name: "foo",
						State: api.ContainerState{
							Running: &api.ContainerStateRunning{},
						},
					},
				},
				Phase: api.PodRunning,
			}, nil
		},
	}
	executor := New(config)

	executor.Init(mockDriver)
	executor.Registered(mockDriver, nil, nil, nil)

	select {
	case <-updates:
	case <-time.After(time.Second):
		t.Fatalf("Executor should send an initial update on Registration")
	}

	pod := NewTestPod(1)
	podTask, err := podtask.New(api.NewDefaultContext(), "",
		*pod, &mesosproto.ExecutorInfo{})
	assert.Equal(t, nil, err, "must be able to create a task from a pod")

	taskInfo := podTask.BuildTaskInfo()
	data, err := testapi.Codec().Encode(pod)
	assert.Equal(t, nil, err, "must be able to encode a pod's spec data")
	taskInfo.Data = data
	var statusUpdateCalls sync.WaitGroup
	statusUpdateDone := func(_ mock.Arguments) { statusUpdateCalls.Done() }

	statusUpdateCalls.Add(1)
	mockDriver.On(
		"SendStatusUpdate",
		mesosproto.TaskState_TASK_STARTING,
	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()

	statusUpdateCalls.Add(1)
	mockDriver.On(
		"SendStatusUpdate",
		mesosproto.TaskState_TASK_RUNNING,
	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()

	executor.LaunchTask(mockDriver, taskInfo)

	assertext.EventuallyTrue(t, 5*time.Second, func() bool {
		executor.lock.Lock()
		defer executor.lock.Unlock()
		return len(executor.tasks) == 1 && len(executor.pods) == 1
	}, "executor must be able to create a task and a pod")

	gotPodUpdate := false
	select {
	case m := <-updates:
		update, ok := m.(kubelet.PodUpdate)
		if ok && len(update.Pods) == 1 {
			gotPodUpdate = true
		}
	case <-time.After(time.Second):
	}
	assert.Equal(t, true, gotPodUpdate,
		"the executor should send an update about a new pod to "+
			"the updates chan when creating a new one.")

	// Allow some time for asynchronous requests to the driver.
	finished := kmruntime.After(statusUpdateCalls.Wait)
	select {
	case <-finished:
	case <-time.After(5 * time.Second):
		t.Fatalf("timed out waiting for status update calls to finish")
	}

	statusUpdateCalls.Add(1)
	mockDriver.On(
		"SendStatusUpdate",
		mesosproto.TaskState_TASK_KILLED,
	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()

	executor.KillTask(mockDriver, taskInfo.TaskId)

	assertext.EventuallyTrue(t, 5*time.Second, func() bool {
		executor.lock.Lock()
		defer executor.lock.Unlock()
		return len(executor.tasks) == 0 && len(executor.pods) == 0
	}, "executor must be able to kill a created task and pod")

	// Allow some time for asynchronous requests to the driver.
	finished = kmruntime.After(statusUpdateCalls.Wait)
	select {
	case <-finished:
	case <-time.After(5 * time.Second):
		t.Fatalf("timed out waiting for status update calls to finish")
	}
	mockDriver.AssertExpectations(t)
}
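// NOTE: the test above synchronizes with the asynchronous driver calls by
// pairing each mocked SendStatusUpdate expectation with a sync.WaitGroup:
// Add(1) before registering the expectation, Done() in the mock's Run hook,
// and kmruntime.After(statusUpdateCalls.Wait) to turn the blocking Wait into
// a channel that can be selected on with a timeout.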

// TestExecutorFrameworkMessage ensures that the executor is able to
// handle messages from the framework, specifically about lost tasks
// and Kamikaze. When a task is lost, the executor needs to clean up
// its state. When a Kamikaze message is received, the executor should
// attempt suicide.
func TestExecutorFrameworkMessage(t *testing.T) {
	mockDriver := &MockExecutorDriver{}
	kubeletFinished := make(chan struct{})
	config := Config{
		Docker:  dockertools.ConnectToDockerOrDie("fake://"),
		Updates: make(chan interface{}, 1024),
		APIClient: client.NewOrDie(&client.Config{
			Host:    "fakehost",
			Version: testapi.Version(),
		}),
		ShutdownAlert: func() {
			close(kubeletFinished)
		},
		KubeletFinished: kubeletFinished,
	}
	executor := New(config)

	executor.Init(mockDriver)
	executor.Registered(mockDriver, nil, nil, nil)

	executor.FrameworkMessage(mockDriver, "test framework message")

	// set up a pod to then lose
	pod := NewTestPod(1)
	podTask, _ := podtask.New(api.NewDefaultContext(), "foo",
		*pod, &mesosproto.ExecutorInfo{})

	taskInfo := podTask.BuildTaskInfo()
	data, _ := testapi.Codec().Encode(pod)
	taskInfo.Data = data

	executor.LaunchTask(mockDriver, taskInfo)

	// send task-lost message for it
	called := make(chan struct{})
	mockDriver.On(
		"SendStatusUpdate",
		mesosproto.TaskState_TASK_LOST,
	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(func(_ mock.Arguments) { close(called) }).Once()

	executor.FrameworkMessage(mockDriver, "task-lost:foo")
	assertext.EventuallyTrue(t, 5*time.Second, func() bool {
		executor.lock.Lock()
		defer executor.lock.Unlock()
		return len(executor.tasks) == 0 && len(executor.pods) == 0
	}, "executor must clean up a lost task and pod")

	select {
	case <-called:
	case <-time.After(5 * time.Second):
		t.Fatalf("timed out waiting for SendStatusUpdate")
	}

	mockDriver.On("Stop").Return(mesosproto.Status_DRIVER_STOPPED, nil).Once()

	executor.FrameworkMessage(mockDriver, messages.Kamikaze)
	assert.Equal(t, true, executor.isDone(),
		"executor should have shut down after receiving a Kamikaze message")

	mockDriver.AssertExpectations(t)
}

// NewTestPod creates a pod with a given index, requiring one port
func NewTestPod(i int) *api.Pod {
	name := fmt.Sprintf("pod%d", i)
	return &api.Pod{
		TypeMeta: api.TypeMeta{APIVersion: testapi.Version()},
		ObjectMeta: api.ObjectMeta{
			Name:      name,
			Namespace: api.NamespaceDefault,
			SelfLink:  testapi.SelfLink("pods", name),
		},
		Spec: api.PodSpec{
			Containers: []api.Container{
				{
					Ports: []api.ContainerPort{
						{
							ContainerPort: 8000 + i,
							Protocol:      api.ProtocolTCP,
						},
					},
				},
			},
		},
		Status: api.PodStatus{
			Conditions: []api.PodCondition{
				{
					Type:   api.PodReady,
					Status: api.ConditionTrue,
				},
			},
		},
	}
}

// MockPodsListWatch mocks a pods ListWatch, usually listening on the apiserver pods watch endpoint
type MockPodsListWatch struct {
	ListWatch   cache.ListWatch
	fakeWatcher *watch.FakeWatcher
	list        api.PodList
}

// TestServer is an apiserver mock which partially mocks the pods API
type TestServer struct {
	server *httptest.Server
	Stats  map[string]uint
	lock   sync.Mutex
}

func NewTestServer(t *testing.T, namespace string, pods *api.PodList) *TestServer {
	ts := TestServer{
		Stats: map[string]uint{},
	}
	mux := http.NewServeMux()

	mux.HandleFunc(testapi.ResourcePath("bindings", namespace, ""), func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	})

	ts.server = httptest.NewServer(mux)
	return &ts
}

func NewMockPodsListWatch(initialPodList api.PodList) *MockPodsListWatch {
	lw := MockPodsListWatch{
		fakeWatcher: watch.NewFake(),
		list:        initialPodList,
	}
	lw.ListWatch = cache.ListWatch{
		WatchFunc: func(resourceVersion string) (watch.Interface, error) {
			return lw.fakeWatcher, nil
		},
		ListFunc: func() (runtime.Object, error) {
			return &lw.list, nil
		},
	}
	return &lw
}

// TestExecutorShutdown ensures that the executor properly shuts down
// when Shutdown is called.
func TestExecutorShutdown(t *testing.T) {
	mockDriver := &MockExecutorDriver{}
	kubeletFinished := make(chan struct{})
	var exitCalled int32 = 0
	config := Config{
		Docker:  dockertools.ConnectToDockerOrDie("fake://"),
		Updates: make(chan interface{}, 1024),
		ShutdownAlert: func() {
			close(kubeletFinished)
		},
		KubeletFinished: kubeletFinished,
		ExitFunc: func(_ int) {
			atomic.AddInt32(&exitCalled, 1)
		},
	}
	executor := New(config)

	executor.Init(mockDriver)
	executor.Registered(mockDriver, nil, nil, nil)

	mockDriver.On("Stop").Return(mesosproto.Status_DRIVER_STOPPED, nil).Once()

	executor.Shutdown(mockDriver)

	assert.Equal(t, false, executor.isConnected(),
		"executor should not be connected after Shutdown")
	assert.Equal(t, true, executor.isDone(),
		"executor should be in Done state after Shutdown")

	select {
	case <-executor.Done():
	default:
		t.Fatal("done channel should be closed after shutdown")
	}

	assert.Equal(t, true, atomic.LoadInt32(&exitCalled) > 0,
		"the executor should call its ExitFunc when it is ready to close down")

	mockDriver.AssertExpectations(t)
}

func TestExecutorsendFrameworkMessage(t *testing.T) {
	mockDriver := &MockExecutorDriver{}
	executor := NewTestKubernetesExecutor()

	executor.Init(mockDriver)
	executor.Registered(mockDriver, nil, nil, nil)

	called := make(chan struct{})
	mockDriver.On(
		"SendFrameworkMessage",
		"foo bar baz",
	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(func(_ mock.Arguments) { close(called) }).Once()
	executor.sendFrameworkMessage(mockDriver, "foo bar baz")

	// guard against data race in mock driver between AssertExpectations and Called
	select {
	case <-called: // expected
	case <-time.After(5 * time.Second):
		t.Fatalf("expected call to SendFrameworkMessage")
	}
	mockDriver.AssertExpectations(t)
}
18
contrib/mesos/pkg/executor/messages/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package messages exposes executor event/message names as constants.
package messages
32
contrib/mesos/pkg/executor/messages/messages.go
Normal file
@@ -0,0 +1,32 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package messages

// messages that ship with TaskStatus objects

const (
	ContainersDisappeared    = "containers-disappeared"
	CreateBindingFailure     = "create-binding-failure"
	CreateBindingSuccess     = "create-binding-success"
	ExecutorUnregistered     = "executor-unregistered"
	ExecutorShutdown         = "executor-shutdown"
	LaunchTaskFailed         = "launch-task-failed"
	TaskKilled               = "task-killed"
	UnmarshalTaskDataFailure = "unmarshal-task-data-failure"
	TaskLostAck              = "task-lost-ack" // executor acknowledgement of forwarded TASK_LOST framework message
	Kamikaze                 = "kamikaze"
)
81
contrib/mesos/pkg/executor/mock_test.go
Normal file
@@ -0,0 +1,81 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package executor

import (
	"testing"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
	"github.com/mesos/mesos-go/mesosproto"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
)

type MockExecutorDriver struct {
	mock.Mock
}

func (m *MockExecutorDriver) Start() (mesosproto.Status, error) {
	args := m.Called()
	return args.Get(0).(mesosproto.Status), args.Error(1)
}

func (m *MockExecutorDriver) Stop() (mesosproto.Status, error) {
	args := m.Called()
	return args.Get(0).(mesosproto.Status), args.Error(1)
}

func (m *MockExecutorDriver) Abort() (mesosproto.Status, error) {
	args := m.Called()
	return args.Get(0).(mesosproto.Status), args.Error(1)
}

func (m *MockExecutorDriver) Join() (mesosproto.Status, error) {
	args := m.Called()
	return args.Get(0).(mesosproto.Status), args.Error(1)
}

func (m *MockExecutorDriver) Run() (mesosproto.Status, error) {
	args := m.Called()
	return args.Get(0).(mesosproto.Status), args.Error(1)
}

func (m *MockExecutorDriver) SendStatusUpdate(taskStatus *mesosproto.TaskStatus) (mesosproto.Status, error) {
	args := m.Called(*taskStatus.State)
	return args.Get(0).(mesosproto.Status), args.Error(1)
}

func (m *MockExecutorDriver) SendFrameworkMessage(msg string) (mesosproto.Status, error) {
	args := m.Called(msg)
	return args.Get(0).(mesosproto.Status), args.Error(1)
}

func NewTestKubernetesExecutor() *KubernetesExecutor {
	return New(Config{
		Docker:  dockertools.ConnectToDockerOrDie("fake://"),
		Updates: make(chan interface{}, 1024),
	})
}

func TestExecutorNew(t *testing.T) {
	mockDriver := &MockExecutorDriver{}
	executor := NewTestKubernetesExecutor()
	executor.Init(mockDriver)

	assert.Equal(t, executor.isDone(), false, "executor should not be in Done state on initialization")
	assert.Equal(t, executor.isConnected(), false, "executor should not be connected on initialization")
}
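// NOTE: SendStatusUpdate records only the TaskState (m.Called(*taskStatus.State)),
// so tests can assert "a TASK_RUNNING update was sent" without having to
// match the full TaskStatus payload.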
18
contrib/mesos/pkg/executor/service/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package service contains the cmd/k8sm-executor glue code.
package service
600
contrib/mesos/pkg/executor/service/service.go
Normal file
@@ -0,0 +1,600 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package service

import (
	"bufio"
	"fmt"
	"io"
	"math/rand"
	"net"
	"net/http"
	"os"
	"os/exec"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/cmd/kubelet/app"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/credentialprovider"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/healthz"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/cadvisor"
	kconfig "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/config"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util/mount"
	log "github.com/golang/glog"
	"github.com/kardianos/osext"
	bindings "github.com/mesos/mesos-go/executor"

	"github.com/spf13/pflag"
)

const (
	// if we don't use this source then the kubelet will do funny things with mirror pods.
	// @see ConfigSourceAnnotationKey
	MESOS_CFG_SOURCE = kubelet.ApiserverSource
)

type KubeletExecutorServer struct {
	*app.KubeletServer
	RunProxy       bool
	ProxyLogV      int
	ProxyExec      string
	ProxyLogfile   string
	ProxyBindall   bool
	SuicideTimeout time.Duration
	ShutdownFD     int
	ShutdownFIFO   string
}

func NewKubeletExecutorServer() *KubeletExecutorServer {
	k := &KubeletExecutorServer{
		KubeletServer:  app.NewKubeletServer(),
		RunProxy:       true,
		ProxyExec:      "./kube-proxy",
		ProxyLogfile:   "./proxy-log",
		SuicideTimeout: config.DefaultSuicideTimeout,
	}
	if pwd, err := os.Getwd(); err != nil {
		log.Warningf("failed to determine current directory: %v", err)
	} else {
		k.RootDirectory = pwd // mesos sandbox dir
	}
	k.Address = util.IP(net.ParseIP(defaultBindingAddress()))
	k.ShutdownFD = -1 // indicates unspecified FD
	return k
}

func NewHyperKubeletExecutorServer() *KubeletExecutorServer {
	s := NewKubeletExecutorServer()

	// cache this for later use
	binary, err := osext.Executable()
	if err != nil {
		log.Fatalf("failed to determine currently running executable: %v", err)
	}

	s.ProxyExec = binary
	return s
}

func (s *KubeletExecutorServer) addCoreFlags(fs *pflag.FlagSet) {
	s.KubeletServer.AddFlags(fs)
	fs.BoolVar(&s.RunProxy, "run-proxy", s.RunProxy, "Maintain a running kube-proxy instance as a child proc of this kubelet-executor.")
	fs.IntVar(&s.ProxyLogV, "proxy-logv", s.ProxyLogV, "Log verbosity of the child kube-proxy.")
	fs.StringVar(&s.ProxyLogfile, "proxy-logfile", s.ProxyLogfile, "Path to the kube-proxy log file.")
	fs.BoolVar(&s.ProxyBindall, "proxy-bindall", s.ProxyBindall, "When true will cause kube-proxy to bind to 0.0.0.0.")
	fs.DurationVar(&s.SuicideTimeout, "suicide-timeout", s.SuicideTimeout, "Self-terminate after this period of inactivity. Zero disables suicide watch.")
	fs.IntVar(&s.ShutdownFD, "shutdown-fd", s.ShutdownFD, "File descriptor used to signal shutdown to external watchers, requires shutdown-fifo flag")
	fs.StringVar(&s.ShutdownFIFO, "shutdown-fifo", s.ShutdownFIFO, "FIFO used to signal shutdown to external watchers, requires shutdown-fd flag")
}

func (s *KubeletExecutorServer) AddStandaloneFlags(fs *pflag.FlagSet) {
	s.addCoreFlags(fs)
	fs.StringVar(&s.ProxyExec, "proxy-exec", s.ProxyExec, "Path to the kube-proxy executable.")
}

func (s *KubeletExecutorServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
	s.addCoreFlags(fs)
}

// syncExternalShutdownWatcher returns a Closer that should be closed to signal impending shutdown,
// but only if ShutdownFD and ShutdownFIFO were specified. If they are specified, then this func
// blocks until there's a reader on the FIFO stream.
func (s *KubeletExecutorServer) syncExternalShutdownWatcher() (io.Closer, error) {
	if s.ShutdownFD == -1 || s.ShutdownFIFO == "" {
		return nil, nil
	}
	// redirfd -w n fifo ... # (blocks until the fifo is read)
	log.Infof("blocked, waiting for shutdown reader for FD %d FIFO at %s", s.ShutdownFD, s.ShutdownFIFO)
	return redirfd.Write.Redirect(true, false, redirfd.FileDescriptor(s.ShutdownFD), s.ShutdownFIFO)
}
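// NOTE: the FIFO acts as a shutdown handshake: the Redirect call above blocks
// until some external watcher opens the FIFO for reading, and that watcher in
// turn learns of impending shutdown when the executor closes the returned
// io.Closer (the write end) from its ShutdownAlert hook (see
// createAndInitKubelet below).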

// Run runs the specified KubeletExecutorServer.
func (s *KubeletExecutorServer) Run(hks hyperkube.Interface, _ []string) error {
	rand.Seed(time.Now().UTC().UnixNano())

	if err := util.ApplyOomScoreAdj(0, s.OOMScoreAdj); err != nil {
		log.Info(err)
	}

	var apiclient *client.Client
	clientConfig, err := s.CreateAPIServerClientConfig()
	if err == nil {
		apiclient, err = client.New(clientConfig)
	}
	if err != nil {
		// required for k8sm since we need to send api.Binding information
		// back to the apiserver
		log.Fatalf("No API client: %v", err)
	}

	log.Infof("Using root directory: %v", s.RootDirectory)
	credentialprovider.SetPreferredDockercfgPath(s.RootDirectory)

	shutdownCloser, err := s.syncExternalShutdownWatcher()
	if err != nil {
		return err
	}

	cadvisorInterface, err := cadvisor.New(s.CadvisorPort)
	if err != nil {
		return err
	}

	imageGCPolicy := kubelet.ImageGCPolicy{
		HighThresholdPercent: s.ImageGCHighThresholdPercent,
		LowThresholdPercent:  s.ImageGCLowThresholdPercent,
	}

	diskSpacePolicy := kubelet.DiskSpacePolicy{
		DockerFreeDiskMB: s.LowDiskSpaceThresholdMB,
		RootFreeDiskMB:   s.LowDiskSpaceThresholdMB,
	}

	//TODO(jdef) intentionally NOT initializing a cloud provider here since:
	//(a) the kubelet doesn't actually use it
	//(b) we don't need to create N-kubelet connections to zookeeper for no good reason
	//cloud := cloudprovider.InitCloudProvider(s.CloudProvider, s.CloudConfigFile)
	//log.Infof("Successfully initialized cloud provider: %q from the config file: %q\n", s.CloudProvider, s.CloudConfigFile)

	hostNetworkSources, err := kubelet.GetValidatedSources(strings.Split(s.HostNetworkSources, ","))
	if err != nil {
		return err
	}

	tlsOptions, err := s.InitializeTLS()
	if err != nil {
		return err
	}
	mounter := mount.New()
	if s.Containerized {
		log.V(2).Info("Running kubelet in containerized mode (experimental)")
		mounter = &mount.NsenterMounter{}
	}

	var dockerExecHandler dockertools.ExecHandler
	switch s.DockerExecHandlerName {
	case "native":
		dockerExecHandler = &dockertools.NativeExecHandler{}
	case "nsenter":
		dockerExecHandler = &dockertools.NsenterExecHandler{}
	default:
		log.Warningf("Unknown Docker exec handler %q; defaulting to native", s.DockerExecHandlerName)
		dockerExecHandler = &dockertools.NativeExecHandler{}
	}

	kcfg := app.KubeletConfig{
		Address:            s.Address,
		AllowPrivileged:    s.AllowPrivileged,
		HostNetworkSources: hostNetworkSources,
		HostnameOverride:   s.HostnameOverride,
		RootDirectory:      s.RootDirectory,
		// ConfigFile: ""
		// ManifestURL: ""
		// FileCheckFrequency
		// HTTPCheckFrequency
		PodInfraContainerImage:         s.PodInfraContainerImage,
		SyncFrequency:                  s.SyncFrequency,
		RegistryPullQPS:                s.RegistryPullQPS,
		RegistryBurst:                  s.RegistryBurst,
		MinimumGCAge:                   s.MinimumGCAge,
		MaxPerPodContainerCount:        s.MaxPerPodContainerCount,
		MaxContainerCount:              s.MaxContainerCount,
		RegisterNode:                   s.RegisterNode,
		ClusterDomain:                  s.ClusterDomain,
		ClusterDNS:                     s.ClusterDNS,
		Runonce:                        s.RunOnce,
		Port:                           s.Port,
		ReadOnlyPort:                   s.ReadOnlyPort,
		CadvisorInterface:              cadvisorInterface,
		EnableServer:                   s.EnableServer,
		EnableDebuggingHandlers:        s.EnableDebuggingHandlers,
		DockerClient:                   dockertools.ConnectToDockerOrDie(s.DockerEndpoint),
		KubeClient:                     apiclient,
		MasterServiceNamespace:         s.MasterServiceNamespace,
		VolumePlugins:                  app.ProbeVolumePlugins(),
		NetworkPlugins:                 app.ProbeNetworkPlugins(),
		NetworkPluginName:              s.NetworkPluginName,
		StreamingConnectionIdleTimeout: s.StreamingConnectionIdleTimeout,
		TLSOptions:                     tlsOptions,
		ImageGCPolicy:                  imageGCPolicy,
		DiskSpacePolicy:                diskSpacePolicy,
		Cloud:                          nil, // TODO(jdef) Cloud, specifying null here because we don't want all kubelets polling mesos-master; need to account for this in the cloudprovider impl
		NodeStatusUpdateFrequency:      s.NodeStatusUpdateFrequency,
		ResourceContainer:              s.ResourceContainer,
		CgroupRoot:                     s.CgroupRoot,
		ContainerRuntime:               s.ContainerRuntime,
		Mounter:                        mounter,
		DockerDaemonContainer:          s.DockerDaemonContainer,
		SystemContainer:                s.SystemContainer,
		ConfigureCBR0:                  s.ConfigureCBR0,
		MaxPods:                        s.MaxPods,
		DockerExecHandler:              dockerExecHandler,
	}

	err = app.RunKubelet(&kcfg, app.KubeletBuilder(func(kc *app.KubeletConfig) (app.KubeletBootstrap, *kconfig.PodConfig, error) {
		return s.createAndInitKubelet(kc, hks, clientConfig, shutdownCloser)
	}))
	if err != nil {
		return err
	}

	if s.HealthzPort > 0 {
		healthz.DefaultHealthz()
		go util.Forever(func() {
			err := http.ListenAndServe(net.JoinHostPort(s.HealthzBindAddress.String(), strconv.Itoa(s.HealthzPort)), nil)
			if err != nil {
				log.Errorf("Starting health server failed: %v", err)
			}
		}, 5*time.Second)
	}

	// block until executor is shut down or commits shutdown
	select {}
}

func defaultBindingAddress() string {
	libProcessIP := os.Getenv("LIBPROCESS_IP")
	if libProcessIP == "" {
		return "0.0.0.0"
	} else {
		return libProcessIP
	}
}

func (ks *KubeletExecutorServer) createAndInitKubelet(
	kc *app.KubeletConfig,
	hks hyperkube.Interface,
	clientConfig *client.Config,
	shutdownCloser io.Closer,
) (app.KubeletBootstrap, *kconfig.PodConfig, error) {

	// TODO(k8s): block until all sources have delivered at least one update to the channel, or break the sync loop
	// up into "per source" synchronizations
	// TODO(k8s): KubeletConfig.KubeClient should be a client interface, but client interface misses certain methods
	// used by kubelet. Since NewMainKubelet expects a client interface, we need to make sure we are not passing
	// a nil pointer to it when what we really want is a nil interface.
	var kubeClient client.Interface
	if kc.KubeClient == nil {
		kubeClient = nil
	} else {
		kubeClient = kc.KubeClient
	}

	gcPolicy := kubelet.ContainerGCPolicy{
		MinAge:             kc.MinimumGCAge,
		MaxPerPodContainer: kc.MaxPerPodContainerCount,
		MaxContainers:      kc.MaxContainerCount,
	}

	pc := kconfig.NewPodConfig(kconfig.PodConfigNotificationSnapshotAndUpdates, kc.Recorder)
	updates := pc.Channel(MESOS_CFG_SOURCE)

	klet, err := kubelet.NewMainKubelet(
		kc.Hostname,
		kc.DockerClient,
		kubeClient,
		kc.RootDirectory,
		kc.PodInfraContainerImage,
		kc.SyncFrequency,
		float32(kc.RegistryPullQPS),
		kc.RegistryBurst,
		gcPolicy,
		pc.SeenAllSources,
		kc.RegisterNode,
		kc.ClusterDomain,
		net.IP(kc.ClusterDNS),
		kc.MasterServiceNamespace,
		kc.VolumePlugins,
		kc.NetworkPlugins,
		kc.NetworkPluginName,
		kc.StreamingConnectionIdleTimeout,
		kc.Recorder,
		kc.CadvisorInterface,
		kc.ImageGCPolicy,
		kc.DiskSpacePolicy,
		kc.Cloud,
		kc.NodeStatusUpdateFrequency,
		kc.ResourceContainer,
		kc.OSInterface,
		kc.CgroupRoot,
		kc.ContainerRuntime,
		kc.Mounter,
		kc.DockerDaemonContainer,
		kc.SystemContainer,
		kc.ConfigureCBR0,
		kc.MaxPods,
		kc.DockerExecHandler,
	)
	if err != nil {
		return nil, nil, err
	}

	//TODO(jdef) either configure Watch here with something useful, or else
	// get rid of it from executor.Config
	kubeletFinished := make(chan struct{})
	exec := executor.New(executor.Config{
		Kubelet:         klet,
		Updates:         updates,
		SourceName:      MESOS_CFG_SOURCE,
		APIClient:       kc.KubeClient,
		Docker:          kc.DockerClient,
		SuicideTimeout:  ks.SuicideTimeout,
		KubeletFinished: kubeletFinished,
		ShutdownAlert: func() {
			if shutdownCloser != nil {
				if e := shutdownCloser.Close(); e != nil {
					log.Warningf("failed to signal shutdown to external watcher: %v", e)
				}
			}
		},
		ExitFunc: os.Exit,
		PodStatusFunc: func(kl *kubelet.Kubelet, pod *api.Pod) (*api.PodStatus, error) {
			return kl.GetRuntime().GetPodStatus(pod)
		},
	})

	k := &kubeletExecutor{
		Kubelet:         klet,
		runProxy:        ks.RunProxy,
		proxyLogV:       ks.ProxyLogV,
		proxyExec:       ks.ProxyExec,
		proxyLogfile:    ks.ProxyLogfile,
		proxyBindall:    ks.ProxyBindall,
		address:         ks.Address,
		dockerClient:    kc.DockerClient,
		hks:             hks,
		kubeletFinished: kubeletFinished,
		executorDone:    exec.Done(),
		clientConfig:    clientConfig,
	}

	dconfig := bindings.DriverConfig{
		Executor:         exec,
		HostnameOverride: ks.HostnameOverride,
		BindingAddress:   net.IP(ks.Address),
	}
	if driver, err := bindings.NewMesosExecutorDriver(dconfig); err != nil {
		log.Fatalf("failed to create executor driver: %v", err)
	} else {
		k.driver = driver
	}

	log.V(2).Infof("Initialize executor driver...")

	k.BirthCry()
	exec.Init(k.driver)

	k.StartGarbageCollection()

	return k, pc, nil
}

// kubelet decorator
type kubeletExecutor struct {
	*kubelet.Kubelet
	initialize      sync.Once
	driver          bindings.ExecutorDriver
	runProxy        bool
	proxyLogV       int
	proxyExec       string
	proxyLogfile    string
	proxyBindall    bool
	address         util.IP
	dockerClient    dockertools.DockerInterface
	hks             hyperkube.Interface
	kubeletFinished chan struct{}   // closed once kubelet.Run() returns
	executorDone    <-chan struct{} // from KubeletExecutor.Done()
	clientConfig    *client.Config
}

func (kl *kubeletExecutor) ListenAndServe(address net.IP, port uint, tlsOptions *kubelet.TLSOptions, enableDebuggingHandlers bool) {
	// this func could be called many times, depending how often the HTTP server crashes,
	// so only execute certain initialization procs once
	kl.initialize.Do(func() {
		if kl.runProxy {
			go runtime.Until(kl.runProxyService, 5*time.Second, kl.executorDone)
		}
		go func() {
			if _, err := kl.driver.Run(); err != nil {
				log.Fatalf("executor driver failed: %v", err)
			}
			log.Info("executor Run completed")
|
||||
}()
|
||||
})
|
||||
log.Infof("Starting kubelet server...")
|
||||
kubelet.ListenAndServeKubeletServer(kl, address, port, tlsOptions, enableDebuggingHandlers)
|
||||
}
|
||||
|
||||
// this function blocks as long as the proxy service is running; intended to be
|
||||
// executed asynchronously.
|
||||
func (kl *kubeletExecutor) runProxyService() {
|
||||
|
||||
log.Infof("Starting proxy process...")
|
||||
|
||||
const KM_PROXY = "proxy" //TODO(jdef) constant should be shared with km package
|
||||
args := []string{}
|
||||
|
||||
if kl.hks.FindServer(KM_PROXY) {
|
||||
args = append(args, KM_PROXY)
|
||||
log.V(1).Infof("attempting to using km proxy service")
|
||||
} else if _, err := os.Stat(kl.proxyExec); os.IsNotExist(err) {
|
||||
log.Errorf("failed to locate proxy executable at '%v' and km not present: %v", kl.proxyExec, err)
|
||||
return
|
||||
}
|
||||
|
||||
bindAddress := "0.0.0.0"
|
||||
if !kl.proxyBindall {
|
||||
bindAddress = kl.address.String()
|
||||
}
|
||||
args = append(args,
|
||||
fmt.Sprintf("--bind-address=%s", bindAddress),
|
||||
fmt.Sprintf("--v=%d", kl.proxyLogV),
|
||||
"--logtostderr=true",
|
||||
)
|
||||
|
||||
// add client.Config args here. proxy still calls client.BindClientConfigFlags
|
||||
appendStringArg := func(name, value string) {
|
||||
if value != "" {
|
||||
args = append(args, fmt.Sprintf("--%s=%s", name, value))
|
||||
}
|
||||
}
|
||||
appendStringArg("master", kl.clientConfig.Host)
|
||||
/* TODO(jdef) move these flags to a config file pointed to by --kubeconfig
|
||||
appendStringArg("api-version", kl.clientConfig.Version)
|
||||
appendStringArg("client-certificate", kl.clientConfig.CertFile)
|
||||
appendStringArg("client-key", kl.clientConfig.KeyFile)
|
||||
appendStringArg("certificate-authority", kl.clientConfig.CAFile)
|
||||
args = append(args, fmt.Sprintf("--insecure-skip-tls-verify=%t", kl.clientConfig.Insecure))
|
||||
*/
|
||||
|
||||
log.Infof("Spawning process executable %s with args '%+v'", kl.proxyExec, args)
|
||||
|
||||
cmd := exec.Command(kl.proxyExec, args...)
|
||||
if _, err := cmd.StdoutPipe(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
proxylogs, err := cmd.StderrPipe()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
//TODO(jdef) append instead of truncate? what if the disk is full?
|
||||
logfile, err := os.Create(kl.proxyLogfile)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer logfile.Close()
|
||||
|
||||
ch := make(chan struct{})
|
||||
go func() {
|
||||
defer func() {
|
||||
select {
|
||||
case <-ch:
|
||||
log.Infof("killing proxy process..")
|
||||
if err = cmd.Process.Kill(); err != nil {
|
||||
log.Errorf("failed to kill proxy process: %v", err)
|
||||
}
|
||||
default:
|
||||
}
|
||||
}()
|
||||
|
||||
writer := bufio.NewWriter(logfile)
|
||||
defer writer.Flush()
|
||||
|
||||
<-ch
|
||||
written, err := io.Copy(writer, proxylogs)
|
||||
if err != nil {
|
||||
log.Errorf("error writing data to proxy log: %v", err)
|
||||
}
|
||||
|
||||
log.Infof("wrote %d bytes to proxy log", written)
|
||||
}()
|
||||
|
||||
// if the proxy fails to start then we exit the executor, otherwise
|
||||
// wait for the proxy process to end (and release resources after).
|
||||
if err := cmd.Start(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
close(ch)
|
||||
if err := cmd.Wait(); err != nil {
|
||||
log.Error(err)
|
||||
}
|
||||
}
|
||||
|
||||
// runs the main kubelet loop, closing the kubeletFinished chan when the loop exits.
|
||||
// never returns.
|
||||
func (kl *kubeletExecutor) Run(updates <-chan kubelet.PodUpdate) {
|
||||
defer func() {
|
||||
close(kl.kubeletFinished)
|
||||
util.HandleCrash()
|
||||
log.Infoln("kubelet run terminated") //TODO(jdef) turn down verbosity
|
||||
// important: never return! this is in our contract
|
||||
select {}
|
||||
}()
|
||||
|
||||
// push updates through a closable pipe. when the executor indicates shutdown
|
||||
// via Done() we want to stop the Kubelet from processing updates.
|
||||
pipe := make(chan kubelet.PodUpdate)
|
||||
go func() {
|
||||
// closing pipe will cause our patched kubelet's syncLoop() to exit
|
||||
defer close(pipe)
|
||||
pipeLoop:
|
||||
for {
|
||||
select {
|
||||
case <-kl.executorDone:
|
||||
break pipeLoop
|
||||
default:
|
||||
select {
|
||||
case u := <-updates:
|
||||
select {
|
||||
case pipe <- u: // noop
|
||||
case <-kl.executorDone:
|
||||
break pipeLoop
|
||||
}
|
||||
case <-kl.executorDone:
|
||||
break pipeLoop
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// we expect that Run() will complete after the pipe is closed and the
|
||||
// kubelet's syncLoop() has finished processing its backlog, which hopefully
|
||||
// will not take very long. Peeking into the future (current k8s master) it
|
||||
// seems that the backlog has grown from 1 to 50 -- this may negatively impact
|
||||
// us going forward, time will tell.
|
||||
util.Until(func() { kl.Kubelet.Run(pipe) }, 0, kl.executorDone)
|
||||
|
||||
//TODO(jdef) revisit this if/when executor failover lands
|
||||
err := kl.SyncPods([]*api.Pod{}, nil, nil, time.Now())
|
||||
if err != nil {
|
||||
log.Errorf("failed to cleanly remove all pods and associated state: %v", err)
|
||||
}
|
||||
}
|
||||
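The Run method above hinges on a small reusable idea: forwarding values through an intermediate channel that is closed when a done signal fires, so the downstream consumer's receive loop terminates cleanly instead of blocking forever. A minimal, self-contained sketch of that pattern follows; the names forward, done, and the int payload are illustrative only and not part of this codebase.

package main

import "fmt"

// forward copies values from in to a fresh output channel until either in is
// drained or done is closed; the output channel is closed on exit so that a
// consumer ranging over it terminates cleanly.
func forward(in <-chan int, done <-chan struct{}) <-chan int {
	out := make(chan int)
	go func() {
		defer close(out) // unblocks any consumer ranging over out
		for {
			select {
			case <-done:
				return
			case v, ok := <-in:
				if !ok {
					return
				}
				select {
				case out <- v:
				case <-done:
					return
				}
			}
		}
	}()
	return out
}

func main() {
	in := make(chan int)
	done := make(chan struct{})
	out := forward(in, done)
	go func() {
		in <- 1
		in <- 2
		close(done) // simulate executor shutdown
	}()
	for v := range out {
		fmt.Println("forwarded:", v)
	}
}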
21
contrib/mesos/pkg/hyperkube/doc.go
Normal file
@@ -0,0 +1,21 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package hyperkube facilitates the combination of multiple
// kubernetes-mesos components into a single binary form, providing a
// simple mechanism for intra-component discovery as per the original
// Kubernetes hyperkube package.
package hyperkube
54
contrib/mesos/pkg/hyperkube/types.go
Normal file
@@ -0,0 +1,54 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package hyperkube

import (
	"github.com/spf13/pflag"
)

var (
	nilKube = &nilKubeType{}
)

type Interface interface {
	// FindServer will find a specific server named name.
	FindServer(name string) bool

	// The executable name, used for help and soft-link invocation
	Name() string

	// Flags returns a flagset for "global" flags.
	Flags() *pflag.FlagSet
}

type nilKubeType struct{}

func (n *nilKubeType) FindServer(_ string) bool {
	return false
}

func (n *nilKubeType) Name() string {
	return ""
}

func (n *nilKubeType) Flags() *pflag.FlagSet {
	return nil
}

func Nil() Interface {
	return nilKube
}
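For callers that don't run inside a combined hyperkube binary, Nil() supplies a no-op registry so code can degrade gracefully when a named server is absent. A hedged usage sketch follows; the maybeRunProxy helper is hypothetical and only illustrates the FindServer contract.

package main

import (
	"fmt"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
)

// maybeRunProxy (hypothetical) falls back to an external executable when the
// named server is not compiled into the current binary. hk would normally be
// the real hyperkube registry, or hyperkube.Nil() when running standalone.
func maybeRunProxy(hk hyperkube.Interface) {
	if hk.FindServer("proxy") {
		fmt.Println("using embedded proxy server")
	} else {
		fmt.Println("falling back to external kube-proxy executable")
	}
}

func main() {
	maybeRunProxy(hyperkube.Nil()) // prints the fallback branch
}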
18
contrib/mesos/pkg/offers/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package offers contains code that manages Mesos offers.
package offers
19
contrib/mesos/pkg/offers/metrics/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package metrics defines and exposes instrumentation metrics related to
// Mesos offers.
package metrics
89
contrib/mesos/pkg/offers/metrics/metrics.go
Normal file
@@ -0,0 +1,89 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

const (
	offerSubsystem = "mesos_offers"
)

type OfferDeclinedReason string

const (
	OfferExpired = OfferDeclinedReason("expired")
	OfferRescinded = OfferDeclinedReason("rescinded")
	OfferCompat = OfferDeclinedReason("compat")
)

var (
	OffersReceived = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: offerSubsystem,
			Name: "received",
			Help: "Counter of offers received from Mesos broken out by slave host.",
		},
		[]string{"hostname"},
	)

	OffersDeclined = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: offerSubsystem,
			Name: "declined",
			Help: "Counter of offers declined by the framework broken out by slave host.",
		},
		[]string{"hostname", "reason"},
	)

	OffersAcquired = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: offerSubsystem,
			Name: "acquired",
			Help: "Counter of offers acquired for task launch broken out by slave host.",
		},
		[]string{"hostname"},
	)

	OffersReleased = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: offerSubsystem,
			Name: "released",
			Help: "Counter of previously-acquired offers later released, broken out by slave host.",
		},
		[]string{"hostname"},
	)
)

var registerMetrics sync.Once

func Register() {
	registerMetrics.Do(func() {
		prometheus.MustRegister(OffersReceived)
		prometheus.MustRegister(OffersDeclined)
		prometheus.MustRegister(OffersAcquired)
		prometheus.MustRegister(OffersReleased)
	})
}

func InMicroseconds(d time.Duration) float64 {
	return float64(d.Nanoseconds() / time.Microsecond.Nanoseconds())
}
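A minimal sketch of how these counters are meant to be driven: Register() is idempotent thanks to sync.Once, and label values select the per-host child counter. The hostname, address, and HTTP wiring below are illustrative, not part of this commit.

package main

import (
	"log"
	"net/http"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers/metrics"
	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	metrics.Register() // safe to call repeatedly; guarded by sync.Once

	// record one received and one compat-declined offer for an illustrative host
	metrics.OffersReceived.WithLabelValues("slave-1.example.com").Inc()
	metrics.OffersDeclined.WithLabelValues("slave-1.example.com", string(metrics.OfferCompat)).Inc()

	// expose the default registry for Prometheus scraping
	http.Handle("/metrics", prometheus.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}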
570
contrib/mesos/pkg/offers/offers.go
Normal file
@@ -0,0 +1,570 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package offers

import (
	"fmt"
	"reflect"
	"sync"
	"sync/atomic"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers/metrics"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
	log "github.com/golang/glog"
	mesos "github.com/mesos/mesos-go/mesosproto"
)

const (
	offerListenerMaxAge = 12 // max number of times we'll attempt to fit an offer to a listener before requiring them to re-register themselves
	offerIdCacheTTL = 1 * time.Second // determines expiration of cached offer ids, used in listener notification
	deferredDeclineTtlFactor = 2 // this factor, multiplied by the offer ttl, determines how long to wait before attempting to decline previously claimed offers that were subsequently deleted, then released. see offerStorage.Delete
	notifyListenersDelay = 0 // delay between offer listener notification attempts
)

type Filter func(*mesos.Offer) bool

type Registry interface {
	// Initialize the instance, spawning necessary housekeeping go routines.
	Init(<-chan struct{})

	// Add offers to this registry, rejecting those that are deemed incompatible.
	Add([]*mesos.Offer)

	// Listen for arriving offers that are acceptable to the filter, sending
	// a signal on (by closing) the returned channel. A listener will only
	// ever be notified once, if at all.
	Listen(id string, f Filter) <-chan struct{}

	// invoked when offers are rescinded or expired
	Delete(string, metrics.OfferDeclinedReason)

	// when true, returns the offer that's registered for the given ID
	Get(offerId string) (Perishable, bool)

	// iterate through non-expired offers in this registry
	Walk(Walker) error

	// invalidate one or all (when offerId="") offers; offers are not declined,
	// but are simply flagged as expired in the offer history
	Invalidate(offerId string)

	// invalidate all offers associated with the slave identified by slaveId.
	InvalidateForSlave(slaveId string)
}

// callback that is invoked during a walk through a series of live offers,
// returning with stop=true (or err != nil) if the walk should stop prematurely.
type Walker func(offer Perishable) (stop bool, err error)

type RegistryConfig struct {
	DeclineOffer func(offerId string) <-chan error // tell Mesos that we're declining the offer
	Compat func(*mesos.Offer) bool // returns true if offer is compatible; incompatible offers are declined
	TTL time.Duration // determines a perishable offer's expiration deadline: now+ttl
	LingerTTL time.Duration // if zero, offers will not linger in the FIFO past their expiration deadline
	ListenerDelay time.Duration // specifies the sleep time between offer listener notifications
}

type offerStorage struct {
	RegistryConfig
	offers *cache.FIFO // collection of Perishable, both live and expired
	listeners *queue.DelayFIFO // collection of *offerListener
	delayed *queue.DelayQueue // deadline-oriented offer-event queue
	slaves *slaveStorage // slave to offer mappings
}

type liveOffer struct {
	*mesos.Offer
	expiration time.Time
	acquired int32 // 1 = acquired, 0 = free
}

type expiredOffer struct {
	offerSpec
	deadline time.Time
}

// subset of mesos.OfferInfo useful for recordkeeping
type offerSpec struct {
	id string
	hostname string
}

// offers that may perish (all of them?) implement this interface.
// callers may expect to access these funcs concurrently so implementations
// must provide their own form of synchronization around mutable state.
type Perishable interface {
	// returns true if this offer has expired
	HasExpired() bool
	// if not yet expired, return mesos offer details; otherwise nil
	Details() *mesos.Offer
	// mark this offer as acquired, returning true if it was previously unacquired. thread-safe.
	Acquire() bool
	// mark this offer as un-acquired. thread-safe.
	Release()
	// expire or delete this offer from storage
	age(s *offerStorage)
	// return a unique identifier for this offer
	Id() string
	// return the slave host for this offer
	Host() string
	addTo(*queue.DelayQueue)
}

func (e *expiredOffer) addTo(q *queue.DelayQueue) {
	q.Add(e)
}

func (e *expiredOffer) Id() string {
	return e.id
}

func (e *expiredOffer) Host() string {
	return e.hostname
}

func (e *expiredOffer) HasExpired() bool {
	return true
}

func (e *expiredOffer) Details() *mesos.Offer {
	return nil
}

func (e *expiredOffer) Acquire() bool {
	return false
}

func (e *expiredOffer) Release() {}

func (e *expiredOffer) age(s *offerStorage) {
	log.V(3).Infof("Delete lingering offer: %v", e.id)
	s.offers.Delete(e)
	s.slaves.deleteOffer(e.id)
}

// return the time left to linger
func (e *expiredOffer) GetDelay() time.Duration {
	return e.deadline.Sub(time.Now())
}

func (to *liveOffer) HasExpired() bool {
	return time.Now().After(to.expiration)
}

func (to *liveOffer) Details() *mesos.Offer {
	return to.Offer
}

func (to *liveOffer) Acquire() (acquired bool) {
	if acquired = atomic.CompareAndSwapInt32(&to.acquired, 0, 1); acquired {
		metrics.OffersAcquired.WithLabelValues(to.Host()).Inc()
	}
	return
}

func (to *liveOffer) Release() {
	if released := atomic.CompareAndSwapInt32(&to.acquired, 1, 0); released {
		metrics.OffersReleased.WithLabelValues(to.Host()).Inc()
	}
}

func (to *liveOffer) age(s *offerStorage) {
	s.Delete(to.Id(), metrics.OfferExpired)
}

func (to *liveOffer) Id() string {
	return to.Offer.Id.GetValue()
}

func (to *liveOffer) Host() string {
	return to.Offer.GetHostname()
}

func (to *liveOffer) addTo(q *queue.DelayQueue) {
	q.Add(to)
}

// return the time remaining before the offer expires
func (to *liveOffer) GetDelay() time.Duration {
	return to.expiration.Sub(time.Now())
}

func CreateRegistry(c RegistryConfig) Registry {
	metrics.Register()
	return &offerStorage{
		RegistryConfig: c,
		offers: cache.NewFIFO(cache.KeyFunc(func(v interface{}) (string, error) {
			if perishable, ok := v.(Perishable); !ok {
				return "", fmt.Errorf("expected perishable offer, not '%+v'", v)
			} else {
				return perishable.Id(), nil
			}
		})),
		listeners: queue.NewDelayFIFO(),
		delayed: queue.NewDelayQueue(),
		slaves: newSlaveStorage(),
	}
}

func (s *offerStorage) declineOffer(offerId, hostname string, reason metrics.OfferDeclinedReason) {
	//TODO(jdef) might be nice to spec an abort chan here
	runtime.Signal(proc.OnError(s.DeclineOffer(offerId), func(err error) {
		log.Warningf("decline failed for offer id %v: %v", offerId, err)
	}, nil)).Then(func() {
		metrics.OffersDeclined.WithLabelValues(hostname, string(reason)).Inc()
	})
}

func (s *offerStorage) Add(offers []*mesos.Offer) {
	now := time.Now()
	for _, offer := range offers {
		if !s.Compat(offer) {
			//TODO(jdef) would be nice to batch these up
			offerId := offer.Id.GetValue()
			log.V(3).Infof("Declining incompatible offer %v", offerId)
			s.declineOffer(offerId, offer.GetHostname(), metrics.OfferCompat)
			continue
		}
		timed := &liveOffer{
			Offer: offer,
			expiration: now.Add(s.TTL),
			acquired: 0,
		}
		log.V(3).Infof("Receiving offer %v", timed.Id())
		s.offers.Add(timed)
		s.delayed.Add(timed)
		s.slaves.add(offer.SlaveId.GetValue(), timed.Id())
		metrics.OffersReceived.WithLabelValues(timed.Host()).Inc()
	}
}

// delete an offer from storage, implicitly expires the offer
func (s *offerStorage) Delete(offerId string, reason metrics.OfferDeclinedReason) {
	if offer, ok := s.Get(offerId); ok {
		log.V(3).Infof("Deleting offer %v", offerId)
		// attempt to block others from consuming the offer. if it's already been
		// claimed and is not yet lingering then don't decline it - just mark it as
		// expired in the history: allow a prior claimant to attempt to launch with it
		notYetClaimed := offer.Acquire()
		if offer.Details() != nil {
			if notYetClaimed {
				log.V(3).Infof("Declining offer %v", offerId)
				s.declineOffer(offerId, offer.Host(), reason)
			} else {
				// some pod has acquired this and may attempt to launch a task with it
				// failed schedule/launch attempts are required to Release() any claims on the offer

				// TODO(jdef): not sure what a good value is here. the goal is to provide a
				// launchTasks (driver) operation enough time to complete so that we don't end
				// up declining an offer that we're actually attempting to use.
				time.AfterFunc(deferredDeclineTtlFactor*s.TTL, func() {
					// at this point the offer is in one of five states:
					// a) permanently deleted: expired due to timeout
					// b) permanently deleted: expired due to having been rescinded
					// c) lingering: expired due to timeout
					// d) lingering: expired due to having been rescinded
					// e) claimed: task launched and is using resources from this offer
					// we want to **avoid** declining an offer that's claimed: attempt to acquire
					if offer.Acquire() {
						// previously claimed offer was released, perhaps due to a launch
						// failure, so we should attempt to decline
						log.V(3).Infof("attempting to decline (previously claimed) offer %v", offerId)
						s.declineOffer(offerId, offer.Host(), reason)
					}
				})
			}
		}
		s.expireOffer(offer)
	} // else, ignore offers not in the history
}

func (s *offerStorage) InvalidateForSlave(slaveId string) {
	offerIds := s.slaves.deleteSlave(slaveId)
	for oid := range offerIds {
		s.invalidateOne(oid)
	}
}

// if offerId == "" then expire all known, live offers, otherwise only the offer indicated
func (s *offerStorage) Invalidate(offerId string) {
	if offerId != "" {
		s.invalidateOne(offerId)
		return
	}
	obj := s.offers.List()
	for _, o := range obj {
		offer, ok := o.(Perishable)
		if !ok {
			log.Errorf("Expected perishable offer, not %v", o)
			continue
		}
		offer.Acquire() // attempt to block others from using it
		s.expireOffer(offer)
		// don't decline, we already know that it's an invalid offer
	}
}

func (s *offerStorage) invalidateOne(offerId string) {
	if offer, ok := s.Get(offerId); ok {
		offer.Acquire() // attempt to block others from using it
		s.expireOffer(offer)
		// don't decline, we already know that it's an invalid offer
	}
}

// Walk the collection of offers. The walk stops either as indicated by the
// Walker or when the end of the offer list is reached. Expired offers are
// never passed to a Walker.
func (s *offerStorage) Walk(w Walker) error {
	for _, v := range s.offers.List() {
		offer, ok := v.(Perishable)
		if !ok {
			// offer disappeared...
			continue
		}
		if offer.HasExpired() {
			// never pass expired offers to walkers
			continue
		}
		if stop, err := w(offer); err != nil {
			return err
		} else if stop {
			return nil
		}
	}
	return nil
}

func Expired(offerId, hostname string, ttl time.Duration) *expiredOffer {
	return &expiredOffer{offerSpec{id: offerId, hostname: hostname}, time.Now().Add(ttl)}
}

func (s *offerStorage) expireOffer(offer Perishable) {
	// the offer may or may not be expired due to TTL so check for details
	// since that's a more reliable determinant of lingering status
	if details := offer.Details(); details != nil {
		// recently expired, should linger
		offerId := details.Id.GetValue()
		log.V(3).Infof("Expiring offer %v", offerId)
		if s.LingerTTL > 0 {
			log.V(3).Infof("offer will linger: %v", offerId)
			expired := Expired(offerId, offer.Host(), s.LingerTTL)
			s.offers.Update(expired)
			s.delayed.Add(expired)
		} else {
			log.V(3).Infof("Permanently deleting offer %v", offerId)
			s.offers.Delete(offerId)
			s.slaves.deleteOffer(offerId)
		}
	} // else, it's still lingering...
}

func (s *offerStorage) Get(id string) (Perishable, bool) {
	if obj, ok, _ := s.offers.GetByKey(id); !ok {
		return nil, false
	} else {
		to, ok := obj.(Perishable)
		if !ok {
			log.Errorf("invalid offer object in fifo '%v'", obj)
		}
		return to, ok
	}
}

type offerListener struct {
	id string
	accepts Filter
	notify chan<- struct{}
	age int
	deadline time.Time
	sawVersion uint64
}

func (l *offerListener) GetUID() string {
	return l.id
}

func (l *offerListener) Deadline() (time.Time, bool) {
	return l.deadline, true
}

// register a listener for new offers, whom we'll notify upon receiving such.
// notification is delivered in the form of closing the channel, nothing is ever sent.
func (s *offerStorage) Listen(id string, f Filter) <-chan struct{} {
	if f == nil {
		return nil
	}
	ch := make(chan struct{})
	listen := &offerListener{
		id: id,
		accepts: f,
		notify: ch,
		deadline: time.Now().Add(s.ListenerDelay),
	}
	log.V(3).Infof("Registering offer listener %s", listen.id)
	s.listeners.Offer(listen, queue.ReplaceExisting)
	return ch
}

func (s *offerStorage) ageOffers() {
	offer, ok := s.delayed.Pop().(Perishable)
	if !ok {
		log.Errorf("Expected Perishable, not %v", offer)
		return
	}
	if details := offer.Details(); details != nil && !offer.HasExpired() {
		// live offer has not expired yet: timed out early
		// FWIW: early timeouts are more frequent when GOMAXPROCS is > 1
		offer.addTo(s.delayed)
	} else {
		offer.age(s)
	}
}

func (s *offerStorage) nextListener() *offerListener {
	obj := s.listeners.Pop()
	if listen, ok := obj.(*offerListener); !ok {
		// programming error
		panic(fmt.Sprintf("unexpected listener object %v", obj))
	} else {
		return listen
	}
}

// notify listeners if we find an acceptable offer for them. listeners
// are garbage collected after a certain age (see offerListenerMaxAge).
// ids lists offer IDs that are retrievable from offer storage.
func (s *offerStorage) notifyListeners(ids func() (util.StringSet, uint64)) {
	listener := s.nextListener() // blocking

	offerIds, version := ids()
	if listener.sawVersion == version {
		// no changes to offer list, avoid growing older - just wait for new offers to arrive
		listener.deadline = time.Now().Add(s.ListenerDelay)
		s.listeners.Offer(listener, queue.KeepExisting)
		return
	}
	listener.sawVersion = version

	// notify if we find an acceptable offer
	for id := range offerIds {
		if offer, ok := s.Get(id); !ok || offer.HasExpired() {
			continue
		} else if listener.accepts(offer.Details()) {
			log.V(3).Infof("Notifying offer listener %s", listener.id)
			close(listener.notify)
			return
		}
	}

	// no interesting offers found, re-queue the listener
	listener.age++
	if listener.age < offerListenerMaxAge {
		listener.deadline = time.Now().Add(s.ListenerDelay)
		s.listeners.Offer(listener, queue.KeepExisting)
	} else {
		// garbage collection is as simple as not re-adding the listener to the queue
		log.V(3).Infof("garbage collecting offer listener %s", listener.id)
	}
}

func (s *offerStorage) Init(done <-chan struct{}) {
	// zero delay, reap offers as soon as they expire
	go runtime.Until(s.ageOffers, 0, done)

	// cached offer ids for the purposes of listener notification
	idCache := &stringsCache{
		refill: func() util.StringSet {
			result := util.NewStringSet()
			for _, v := range s.offers.List() {
				if offer, ok := v.(Perishable); ok {
					result.Insert(offer.Id())
				}
			}
			return result
		},
		ttl: offerIdCacheTTL,
	}

	go runtime.Until(func() { s.notifyListeners(idCache.Strings) }, notifyListenersDelay, done)
}

type stringsCache struct {
	expiresAt time.Time
	cached util.StringSet
	ttl time.Duration
	refill func() util.StringSet
	version uint64
}

// not thread-safe
func (c *stringsCache) Strings() (util.StringSet, uint64) {
	now := time.Now()
	if c.expiresAt.Before(now) {
		old := c.cached
		c.cached = c.refill()
		c.expiresAt = now.Add(c.ttl)
		if !reflect.DeepEqual(old, c.cached) {
			c.version++
		}
	}
	return c.cached, c.version
}

type slaveStorage struct {
	sync.Mutex
	index map[string]string // map offerId to slaveId
}

func newSlaveStorage() *slaveStorage {
	return &slaveStorage{
		index: make(map[string]string),
	}
}

// create a mapping between a slave and an offer
func (self *slaveStorage) add(slaveId, offerId string) {
	self.Lock()
	defer self.Unlock()
	self.index[offerId] = slaveId
}

// delete the slave-offer mappings for slaveId, returns the IDs of the offers that were unmapped
func (self *slaveStorage) deleteSlave(slaveId string) util.StringSet {
	offerIds := util.NewStringSet()
	self.Lock()
	defer self.Unlock()
	for oid, sid := range self.index {
		if sid == slaveId {
			offerIds.Insert(oid)
			delete(self.index, oid)
		}
	}
	return offerIds
}

// delete the slave-offer mappings for offerId
func (self *slaveStorage) deleteOffer(offerId string) {
	self.Lock()
	defer self.Unlock()
	delete(self.index, offerId)
}
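A hedged sketch of how a scheduler component might drive this registry end to end: create it with a decline callback, register a one-shot listener, then feed it offers. The configuration values and hostnames are illustrative; the decline callback here just reports immediate success via proc.ErrorChan, as the tests below also do.

package main

import (
	"fmt"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
	mesos "github.com/mesos/mesos-go/mesosproto"
	mesosutil "github.com/mesos/mesos-go/mesosutil"
)

func main() {
	registry := offers.CreateRegistry(offers.RegistryConfig{
		DeclineOffer: func(offerId string) <-chan error {
			fmt.Println("declining", offerId)
			return proc.ErrorChan(nil) // report success immediately
		},
		Compat: func(o *mesos.Offer) bool { return true }, // accept everything
		TTL: 30 * time.Second,
		LingerTTL: 60 * time.Second,
		ListenerDelay: time.Second,
	})

	done := make(chan struct{})
	defer close(done)
	registry.Init(done) // spawns the aging and notification loops

	// wait (at most once) for any offer from an illustrative hostname
	notified := registry.Listen("my-pod", func(o *mesos.Offer) bool {
		return o.GetHostname() == "slave-1.example.com"
	})

	hostname := "slave-1.example.com"
	registry.Add([]*mesos.Offer{{
		Id: mesosutil.NewOfferID("offer-1"),
		SlaveId: mesosutil.NewSlaveID("slave-1"),
		Hostname: &hostname,
	}})

	<-notified // channel is closed once a matching offer arrives
	fmt.Println("got a matching offer")
}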
391
contrib/mesos/pkg/offers/offers_test.go
Normal file
@@ -0,0 +1,391 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package offers

import (
	"errors"
	"sync/atomic"
	"testing"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
	mesos "github.com/mesos/mesos-go/mesosproto"
	util "github.com/mesos/mesos-go/mesosutil"
)

func TestExpiredOffer(t *testing.T) {
	t.Parallel()

	ttl := 2 * time.Second
	o := Expired("test", "testhost", ttl)

	if o.Id() != "test" {
		t.Error("expiredOffer does not return its Id")
	}
	if o.Host() != "testhost" {
		t.Error("expiredOffer does not return its hostname")
	}
	if o.HasExpired() != true {
		t.Error("expiredOffer is not expired")
	}
	if o.Details() != nil {
		t.Error("expiredOffer does not return nil Details")
	}
	if o.Acquire() != false {
		t.Error("expiredOffer must not be able to be acquired")
	}
	if delay := o.GetDelay(); !(0 < delay && delay <= ttl) {
		t.Error("expiredOffer does not return a valid deadline")
	}
} // TestExpiredOffer

func TestTimedOffer(t *testing.T) {
	t.Parallel()

	ttl := 2 * time.Second
	now := time.Now()
	o := &liveOffer{nil, now.Add(ttl), 0}

	if o.HasExpired() {
		t.Errorf("offer ttl was %v and should not have expired yet", ttl)
	}
	if !o.Acquire() {
		t.Fatal("1st acquisition of offer failed")
	}
	o.Release()
	if !o.Acquire() {
		t.Fatal("2nd acquisition of offer failed")
	}
	if o.Acquire() {
		t.Fatal("3rd acquisition of offer passed but prior claim was not released")
	}
	o.Release()
	if !o.Acquire() {
		t.Fatal("4th acquisition of offer failed")
	}
	o.Release()
	time.Sleep(ttl)
	if !o.HasExpired() {
		t.Fatal("offer not expired after ttl passed")
	}
	if !o.Acquire() {
		t.Fatal("5th acquisition of offer failed; should not be tied to expiration")
	}
	if o.Acquire() {
		t.Fatal("6th acquisition of offer succeeded; should already be acquired")
	}
} // TestTimedOffer

func TestOfferStorage(t *testing.T) {
	ttl := time.Second / 4
	var declinedNum int32
	getDeclinedNum := func() int32 { return atomic.LoadInt32(&declinedNum) }
	config := RegistryConfig{
		DeclineOffer: func(offerId string) <-chan error {
			atomic.AddInt32(&declinedNum, 1)
			return proc.ErrorChan(nil)
		},
		Compat: func(o *mesos.Offer) bool {
			return o.Hostname == nil || *o.Hostname != "incompatiblehost"
		},
		TTL: ttl,
		LingerTTL: 2 * ttl,
	}
	storage := CreateRegistry(config)

	done := make(chan struct{})
	storage.Init(done)

	// Add offer
	id := util.NewOfferID("foo")
	o := &mesos.Offer{Id: id}
	storage.Add([]*mesos.Offer{o})

	// Added offer should be in the storage
	if obj, ok := storage.Get(id.GetValue()); obj == nil || !ok {
		t.Error("offer not added")
	}
	if obj, _ := storage.Get(id.GetValue()); obj.Details() != o {
		t.Error("added offer differs from returned offer")
	}

	// Not-added offer is not in storage
	if obj, ok := storage.Get("bar"); obj != nil || ok {
		t.Error("offer bar should not exist in storage")
	}

	// Deleted offer lingers in storage, is acquired and declined
	offer, _ := storage.Get(id.GetValue())
	declinedNumBefore := getDeclinedNum()
	storage.Delete(id.GetValue(), "deleted for test")
	if obj, _ := storage.Get(id.GetValue()); obj == nil {
		t.Error("deleted offer is not lingering")
	}
	if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
		t.Error("deleted offer is not expired")
	}
	if ok := offer.Acquire(); ok {
		t.Error("deleted offer can be acquired")
	}
	if getDeclinedNum() <= declinedNumBefore {
		t.Error("deleted offer was not declined")
	}

	// Acquired offer is only declined after 2*ttl
	id = util.NewOfferID("foo2")
	o = &mesos.Offer{Id: id}
	storage.Add([]*mesos.Offer{o})
	offer, _ = storage.Get(id.GetValue())
	declinedNumBefore = getDeclinedNum()
	offer.Acquire()
	storage.Delete(id.GetValue(), "deleted for test")
	if getDeclinedNum() > declinedNumBefore {
		t.Error("acquired offer is declined")
	}

	offer.Release()
	time.Sleep(3 * ttl)
	if getDeclinedNum() <= declinedNumBefore {
		t.Error("released offer is not declined after 2*ttl")
	}

	// Added offer should be expired after ttl, but lingering
	id = util.NewOfferID("foo3")
	o = &mesos.Offer{Id: id}
	storage.Add([]*mesos.Offer{o})

	time.Sleep(2 * ttl)
	obj, ok := storage.Get(id.GetValue())
	if obj == nil || !ok {
		t.Error("offer not lingering after ttl")
	}
	if !obj.HasExpired() {
		t.Error("offer is not expired after ttl")
	}

	// Should be deleted when waiting longer than LingerTTL
	time.Sleep(2 * ttl)
	if obj, ok := storage.Get(id.GetValue()); obj != nil || ok {
		t.Error("offer not deleted after LingerTTL")
	}

	// Incompatible offer is declined
	id = util.NewOfferID("foo4")
	incompatibleHostname := "incompatiblehost"
	o = &mesos.Offer{Id: id, Hostname: &incompatibleHostname}
	declinedNumBefore = getDeclinedNum()
	storage.Add([]*mesos.Offer{o})
	if obj, ok := storage.Get(id.GetValue()); obj != nil || ok {
		t.Error("incompatible offer not rejected")
	}
	if getDeclinedNum() <= declinedNumBefore {
		t.Error("incompatible offer is not declined")
	}

	// Invalidated offers are not declined, but expired
	id = util.NewOfferID("foo5")
	o = &mesos.Offer{Id: id}
	storage.Add([]*mesos.Offer{o})
	offer, _ = storage.Get(id.GetValue())
	declinedNumBefore = getDeclinedNum()
	storage.Invalidate(id.GetValue())
	if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
		t.Error("invalidated offer is not expired")
	}
	if getDeclinedNum() > declinedNumBefore {
		t.Error("invalidated offer is declined")
	}
	if ok := offer.Acquire(); ok {
		t.Error("invalidated offer can be acquired")
	}

	// Invalidate "" will invalidate all offers
	id = util.NewOfferID("foo6")
	o = &mesos.Offer{Id: id}
	storage.Add([]*mesos.Offer{o})
	id2 := util.NewOfferID("foo7")
	o2 := &mesos.Offer{Id: id2}
	storage.Add([]*mesos.Offer{o2})
	storage.Invalidate("")
	if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
		t.Error("invalidated offer is not expired")
	}
	if obj2, _ := storage.Get(id2.GetValue()); !obj2.HasExpired() {
		t.Error("invalidated offer is not expired")
	}

	// InvalidateForSlave invalidates all offers for that slave, and only those
	id = util.NewOfferID("foo8")
	slaveId := util.NewSlaveID("test-slave")
	o = &mesos.Offer{Id: id, SlaveId: slaveId}
	storage.Add([]*mesos.Offer{o})
	id2 = util.NewOfferID("foo9")
	o2 = &mesos.Offer{Id: id2}
	storage.Add([]*mesos.Offer{o2})
	storage.InvalidateForSlave(slaveId.GetValue())
	if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
		t.Error("invalidated offer for test-slave is not expired")
	}
	if obj2, _ := storage.Get(id2.GetValue()); obj2.HasExpired() {
		t.Error("invalidated offer for another slave is expired")
	}

	close(done)
} // TestOfferStorage

func TestListen(t *testing.T) {
	ttl := time.Second / 4
	config := RegistryConfig{
		DeclineOffer: func(offerId string) <-chan error {
			return proc.ErrorChan(nil)
		},
		Compat: func(o *mesos.Offer) bool {
			return true
		},
		TTL: ttl,
		ListenerDelay: ttl / 2,
	}
	storage := CreateRegistry(config)

	done := make(chan struct{})
	storage.Init(done)

	// Create two listeners with a hostname filter
	hostname1 := "hostname1"
	hostname2 := "hostname2"
	listener1 := storage.Listen("listener1", func(offer *mesos.Offer) bool {
		return offer.GetHostname() == hostname1
	})
	listener2 := storage.Listen("listener2", func(offer *mesos.Offer) bool {
		return offer.GetHostname() == hostname2
	})

	// Add hostname1 offer
	id := util.NewOfferID("foo")
	o := &mesos.Offer{Id: id, Hostname: &hostname1}
	storage.Add([]*mesos.Offer{o})

	// listener1 is notified by closing channel
	select {
	case _, more := <-listener1:
		if more {
			t.Error("listener1 is not closed")
		}
	}

	// listener2 is not notified within ttl
	select {
	case <-listener2:
		t.Error("listener2 is notified")
	case <-time.After(ttl):
	}

	close(done)
} // TestListen

func TestWalk(t *testing.T) {
	t.Parallel()
	config := RegistryConfig{
		DeclineOffer: func(offerId string) <-chan error {
			return proc.ErrorChan(nil)
		},
		TTL: 0 * time.Second,
		LingerTTL: 0 * time.Second,
		ListenerDelay: 0 * time.Second,
	}
	storage := CreateRegistry(config)
	acceptedOfferId := ""
	walked := 0
	walker1 := func(p Perishable) (bool, error) {
		walked++
		if p.Acquire() {
			acceptedOfferId = p.Details().Id.GetValue()
			return true, nil
		}
		return false, nil
	}
	// sanity check
	err := storage.Walk(walker1)
	if err != nil {
		t.Fatalf("received impossible error %v", err)
	}
	if walked != 0 {
		t.Fatal("walked empty storage")
	}
	if acceptedOfferId != "" {
		t.Fatal("somehow found an offer when registry was empty")
	}
	impl, ok := storage.(*offerStorage)
	if !ok {
		t.Fatal("unexpected offer storage impl")
	}
	// single offer
	ttl := 2 * time.Second
	now := time.Now()
	o := &liveOffer{&mesos.Offer{Id: util.NewOfferID("foo")}, now.Add(ttl), 0}

	impl.offers.Add(o)
	err = storage.Walk(walker1)
	if err != nil {
		t.Fatalf("received impossible error %v", err)
	}
	if walked != 1 {
		t.Fatalf("walk count %d", walked)
	}
	if acceptedOfferId != "foo" {
		t.Fatalf("found offer %v", acceptedOfferId)
	}

	acceptedOfferId = ""
	err = storage.Walk(walker1)
	if err != nil {
		t.Fatalf("received impossible error %v", err)
	}
	if walked != 2 {
		t.Fatalf("walk count %d", walked)
	}
	if acceptedOfferId != "" {
		t.Fatalf("found offer %v", acceptedOfferId)
	}

	walker2 := func(p Perishable) (bool, error) {
		walked++
		return true, nil
	}
	err = storage.Walk(walker2)
	if err != nil {
		t.Fatalf("received impossible error %v", err)
	}
	if walked != 3 {
		t.Fatalf("walk count %d", walked)
	}
	if acceptedOfferId != "" {
		t.Fatalf("found offer %v", acceptedOfferId)
	}

	walker3 := func(p Perishable) (bool, error) {
		walked++
		return true, errors.New("baz")
	}
	err = storage.Walk(walker3)
	if err == nil {
		t.Fatal("expected error")
	}
	if walked != 4 {
		t.Fatalf("walk count %d", walked)
	}
}
19
contrib/mesos/pkg/proc/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package proc provides opinionated utilities for processing background
// operations and future errors, somewhat inspired by libprocess.
package proc
34
contrib/mesos/pkg/proc/errors.go
Normal file
@@ -0,0 +1,34 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package proc

import (
	"errors"
)

var (
	errProcessTerminated = errors.New("cannot execute action because process has terminated")
	errIllegalState = errors.New("illegal state, cannot execute action")
)

func IsProcessTerminated(err error) bool {
	return err == errProcessTerminated
}

func IsIllegalState(err error) bool {
	return err == errIllegalState
}
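Because the sentinel errors are unexported, callers classify failures through these predicates. A hedged usage sketch follows; scheduleOrLog is hypothetical, and it assumes the Process interface exposes Do(Action) <-chan error and End(), as procImpl in proc.go below does.

package main

import (
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
	log "github.com/golang/glog"
)

// scheduleOrLog (hypothetical) schedules an action on a process and
// distinguishes "process already ended" from other failures.
func scheduleOrLog(p proc.Process, a proc.Action) {
	// a closed error chan (nil error) means the action was accepted
	if err, ok := <-p.Do(a); ok && err != nil {
		if proc.IsProcessTerminated(err) {
			log.Info("process ended before the action could run")
			return
		}
		log.Errorf("action failed: %v", err)
	}
}

func main() {
	p := proc.New()
	defer p.End()
	scheduleOrLog(p, func() { log.Info("hello from the action loop") })
}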
377
contrib/mesos/pkg/proc/proc.go
Normal file
@@ -0,0 +1,377 @@
|
||||
/*
|
||||
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package proc
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
|
||||
log "github.com/golang/glog"
|
||||
)
|
||||
|
||||
const (
|
||||
// if the action processor crashes (if some Action panics) then we
|
||||
// wait this long before spinning up the action processor again.
|
||||
defaultActionHandlerCrashDelay = 100 * time.Millisecond
|
||||
|
||||
// how many actions we can store in the backlog
|
||||
defaultActionQueueDepth = 1024
|
)

type procImpl struct {
    Config
    backlog   chan Action    // action queue
    terminate chan struct{}  // signaled via close()
    wg        sync.WaitGroup // End() terminates when the wait is over
    done      runtime.Signal
    state     *stateType
    pid       uint32
    writeLock sync.Mutex    // avoid data race between write and close of backlog
    changed   *sync.Cond    // wait/signal for backlog changes
    engine    DoerFunc      // isolated this for easier unit testing later on
    running   chan struct{} // closes once event loop processing starts
    dead      chan struct{} // closes upon completion of process termination
}

type Config struct {
    // cooldown period in between deferred action crashes
    actionHandlerCrashDelay time.Duration

    // determines the size of the deferred action backlog
    actionQueueDepth uint32
}

var (
    defaultConfig = Config{
        actionHandlerCrashDelay: defaultActionHandlerCrashDelay,
        actionQueueDepth:        defaultActionQueueDepth,
    }
    pid           uint32
    closedErrChan <-chan error
)

func init() {
    ch := make(chan error)
    close(ch)
    closedErrChan = ch
}

func New() Process {
    return newConfigured(defaultConfig)
}

func newConfigured(config Config) Process {
    state := stateNew
    pi := &procImpl{
        Config:    config,
        backlog:   make(chan Action, config.actionQueueDepth),
        terminate: make(chan struct{}),
        state:     &state,
        pid:       atomic.AddUint32(&pid, 1),
        running:   make(chan struct{}),
        dead:      make(chan struct{}),
    }
    pi.engine = DoerFunc(pi.doLater)
    pi.changed = sync.NewCond(&pi.writeLock)
    pi.wg.Add(1) // symmetrical to wg.Done() in End()
    pi.done = pi.begin()
    return pi
}

// Done returns a chan that closes upon termination of the action processing loop.
func (self *procImpl) Done() <-chan struct{} {
    return self.done
}

func (self *procImpl) Running() <-chan struct{} {
    return self.running
}

func (self *procImpl) begin() runtime.Signal {
    if !self.state.transition(stateNew, stateRunning) {
        panic(fmt.Errorf("failed to transition from New to Running state"))
    }
    defer log.V(2).Infof("started process %d", self.pid)
    var entered runtime.Latch
    // execute actions on the backlog chan
    return runtime.After(func() {
        runtime.Until(func() {
            if entered.Acquire() {
                close(self.running)
                self.wg.Add(1)
            }
            for action := range self.backlog {
                select {
                case <-self.terminate:
                    return
                default:
                    // signal to indicate there's room in the backlog now
                    self.changed.Broadcast()
                    // rely on Until to handle action panics
                    action()
                }
            }
        }, self.actionHandlerCrashDelay, self.terminate)
    }).Then(func() {
        log.V(2).Infof("finished processing action backlog for process %d", self.pid)
        if !entered.Acquire() {
            self.wg.Done()
        }
    })
}

// doLater executes some action in the context of the current process. Actions
// executed via this func are to be executed in a concurrency-safe manner:
// no two actions should execute at the same time. Invocations of this func
// should not block for very long, unless the action backlog is full or the
// process is terminating.
// Returns errProcessTerminated if the process has already ended.
func (self *procImpl) doLater(deferredAction Action) (err <-chan error) {
    a := Action(func() {
        self.wg.Add(1)
        defer self.wg.Done()
        deferredAction()
    })

    scheduled := false
    self.writeLock.Lock()
    defer self.writeLock.Unlock()

    for err == nil && !scheduled {
        switch s := self.state.get(); s {
        case stateRunning:
            select {
            case self.backlog <- a:
                scheduled = true
            default:
                self.changed.Wait()
            }
        case stateTerminal:
            err = ErrorChan(errProcessTerminated)
        default:
            err = ErrorChan(errIllegalState)
        }
    }
    return
}

// Do implements the Doer interface: it schedules some action to be executed
// via the current execution engine.
func (self *procImpl) Do(a Action) <-chan error {
    return self.engine(a)
}
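
// Illustrative usage sketch (reviewer-added, not part of this commit): how a
// caller might schedule work through Do. Per doLater above, a nil chan means
// the action was accepted; a non-nil chan carries the scheduling failure.
//
//	p := New()
//	errCh := p.Do(func() {
//		// runs serialized with every other action of this process
//	})
//	if errCh != nil {
//		// scheduling failed: errProcessTerminated or errIllegalState
//		err := <-errCh
//		log.Errorf("action was not scheduled: %v", err)
//	}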

// OnError spawns a goroutine that waits for an error. If a non-nil error is
// read from the channel then the handler func is invoked, otherwise (nil error
// or closed chan) the handler is skipped. If a nil handler is specified then
// it is never invoked. The signal chan that's returned closes once the error
// processing logic (and handler, if any) has completed.
func OnError(ch <-chan error, f func(error), abort <-chan struct{}) <-chan struct{} {
    return runtime.After(func() {
        if ch == nil {
            return
        }
        select {
        case err, ok := <-ch:
            if ok && err != nil && f != nil {
                f(err)
            }
        case <-abort:
            if f != nil {
                f(errProcessTerminated)
            }
        }
    })
}

func (self *procImpl) OnError(ch <-chan error, f func(error)) <-chan struct{} {
    return OnError(ch, f, self.Done())
}

func (self *procImpl) flush() {
    log.V(2).Infof("flushing action backlog for process %d", self.pid)
    i := 0
    //TODO: replace with `for range self.backlog` once Go 1.3 support is dropped
    for {
        _, open := <-self.backlog
        if !open {
            break
        }
        i++
    }
    log.V(2).Infof("flushed %d backlog actions for process %d", i, self.pid)
}

func (self *procImpl) End() <-chan struct{} {
    if self.state.transitionTo(stateTerminal, stateTerminal) {
        go func() {
            defer close(self.dead)
            self.writeLock.Lock()
            defer self.writeLock.Unlock()

            log.V(2).Infof("terminating process %d", self.pid)

            close(self.backlog)
            close(self.terminate)
            self.wg.Done()
            self.changed.Broadcast()

            log.V(2).Infof("waiting for deferred actions to complete")

            // wait for all pending actions to complete, then flush the backlog
            self.wg.Wait()
            self.flush()
        }()
    }
    return self.dead
}

type errorOnce struct {
    once  sync.Once
    err   chan error
    abort <-chan struct{}
}

func NewErrorOnce(abort <-chan struct{}) ErrorOnce {
    return &errorOnce{
        err:   make(chan error, 1),
        abort: abort,
    }
}

func (b *errorOnce) Err() <-chan error {
    return b.err
}

func (b *errorOnce) Reportf(msg string, args ...interface{}) {
    b.Report(fmt.Errorf(msg, args...))
}

func (b *errorOnce) Report(err error) {
    b.once.Do(func() {
        select {
        case b.err <- err:
        default:
        }
    })
}

func (b *errorOnce) Send(errIn <-chan error) ErrorOnce {
    go b.forward(errIn)
    return b
}

func (b *errorOnce) forward(errIn <-chan error) {
    if errIn == nil {
        b.Report(nil)
        return
    }
    select {
    case err := <-errIn:
        b.Report(err)
    case <-b.abort:
        b.Report(errProcessTerminated)
    }
}
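
// ErrorOnce usage sketch (reviewer-added, not part of this commit): fan one or
// more error sources into a single one-shot result, aborting if the process
// dies first. `someAction` is a hypothetical Action. Note that Send tolerates
// the nil chan that Do returns on success (forward reports a nil error).
//
//	eo := NewErrorOnce(p.Done())
//	eo.Send(p.Do(someAction)) // forward the scheduling result, if any
//	if err := <-eo.Err(); err != nil {
//		log.Errorf("action failed to schedule: %v", err)
//	}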

type processAdapter struct {
    parent   Process
    delegate Doer
}

func (p *processAdapter) Do(a Action) <-chan error {
    if p == nil || p.parent == nil || p.delegate == nil {
        return ErrorChan(errIllegalState)
    }
    errCh := NewErrorOnce(p.Done())
    go func() {
        errOuter := p.parent.Do(func() {
            errInner := p.delegate.Do(a)
            errCh.forward(errInner)
        })
        // if the outer err is !nil then either the parent failed to schedule
        // the action, or else it backgrounded the scheduling task.
        if errOuter != nil {
            errCh.forward(errOuter)
        }
    }()
    return errCh.Err()
}

func (p *processAdapter) End() <-chan struct{} {
    if p != nil && p.parent != nil {
        return p.parent.End()
    }
    return nil
}

func (p *processAdapter) Done() <-chan struct{} {
    if p != nil && p.parent != nil {
        return p.parent.Done()
    }
    return nil
}

func (p *processAdapter) Running() <-chan struct{} {
    if p != nil && p.parent != nil {
        return p.parent.Running()
    }
    return nil
}

func (p *processAdapter) OnError(ch <-chan error, f func(error)) <-chan struct{} {
    if p != nil && p.parent != nil {
        return p.parent.OnError(ch, f)
    }
    return nil
}

// DoWith returns a process that, within its execution context, delegates to
// the specified Doer. If the given Doer instance is nil, a valid Process is
// still returned, though calls to its Do() implementation will always return
// errIllegalState. If the given Process instance is nil then, in addition to
// the behavior in the prior sentence, calls to End() and Done() are
// effectively noops.
func DoWith(other Process, d Doer) Process {
    return &processAdapter{
        parent:   other,
        delegate: d,
    }
}

func ErrorChanf(msg string, args ...interface{}) <-chan error {
    return ErrorChan(fmt.Errorf(msg, args...))
}

func ErrorChan(err error) <-chan error {
    if err == nil {
        return closedErrChan
    }
    ch := make(chan error, 1)
    ch <- err
    return ch
}

// Do invokes the func f on action a. Returns an illegal state error if f is nil.
func (f DoerFunc) Do(a Action) <-chan error {
    if f != nil {
        return f(a)
    }
    return ErrorChan(errIllegalState)
}
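
// DoWith sketch (reviewer-added, not part of this commit): wrap a parent
// process so that every action also passes through a delegate Doer. When
// DoWith calls are nested, delegates run in the order they were wrapped,
// innermost first (see runDelegationTest in the tests below).
//
//	logged := DoWith(p, DoerFunc(func(a Action) <-chan error {
//		log.V(1).Infof("running delegated action")
//		a()
//		return nil // nil means the delegate itself ran the action
//	}))
//	errCh := logged.Do(func() { /* ... */ })
//	// unlike the plain process, the adapter always returns a non-nil chan;
//	// read it via OnError or an ErrorOnce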
373
contrib/mesos/pkg/proc/proc_test.go
Normal file
@@ -0,0 +1,373 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package proc

import (
    "fmt"
    "sync"
    "testing"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
    log "github.com/golang/glog"
)

// fatalAfter logs a testing.Fatalf if the elapsed time d passes before the signal chan done is closed.
func fatalAfter(t *testing.T, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
    select {
    case <-done:
    case <-time.After(d):
        t.Fatalf(msg, args...)
    }
}

func errorAfter(errOnce ErrorOnce, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
    select {
    case <-done:
    case <-time.After(d):
        errOnce.Reportf(msg, args...)
    }
}

// fatalOn logs a testing.Fatalf if the signal chan closes before the elapsed time d passes.
func fatalOn(t *testing.T, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
    select {
    case <-done:
        t.Fatalf(msg, args...)
    case <-time.After(d):
    }
}

func TestProc_manyEndings(t *testing.T) {
    p := New()
    const COUNT = 20
    var wg sync.WaitGroup
    wg.Add(COUNT)
    for i := 0; i < COUNT; i++ {
        runtime.On(p.End(), wg.Done)
    }
    fatalAfter(t, runtime.After(wg.Wait), 5*time.Second, "timed out waiting for loose End()s")
    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

func TestProc_singleAction(t *testing.T) {
    p := New()
    scheduled := make(chan struct{})
    called := make(chan struct{})

    go func() {
        log.Infof("do'ing deferred action")
        defer close(scheduled)
        err := p.Do(func() {
            defer close(called)
            log.Infof("deferred action invoked")
        })
        if err != nil {
            t.Fatalf("unexpected error: %v", err)
        }
    }()

    fatalAfter(t, scheduled, 5*time.Second, "timed out waiting for deferred action to be scheduled")
    fatalAfter(t, called, 5*time.Second, "timed out waiting for deferred action to be invoked")

    p.End()

    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

func TestProc_singleActionEnd(t *testing.T) {
    p := New()
    scheduled := make(chan struct{})
    called := make(chan struct{})

    go func() {
        log.Infof("do'ing deferred action")
        defer close(scheduled)
        err := p.Do(func() {
            defer close(called)
            log.Infof("deferred action invoked")
            p.End()
        })
        if err != nil {
            t.Fatalf("unexpected error: %v", err)
        }
    }()

    fatalAfter(t, scheduled, 5*time.Second, "timed out waiting for deferred action to be scheduled")
    fatalAfter(t, called, 5*time.Second, "timed out waiting for deferred action to be invoked")
    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

func TestProc_multiAction(t *testing.T) {
    p := New()
    const COUNT = 10
    var called sync.WaitGroup
    called.Add(COUNT)

    // test FIFO property
    next := 0
    for i := 0; i < COUNT; i++ {
        log.Infof("do'ing deferred action %d", i)
        idx := i
        err := p.Do(func() {
            defer called.Done()
            log.Infof("deferred action invoked")
            if next != idx {
                t.Fatalf("expected index %d instead of %d", idx, next)
            }
            next++
        })
        if err != nil {
            t.Fatalf("unexpected error: %v", err)
        }
    }

    fatalAfter(t, runtime.After(called.Wait), 2*time.Second, "timed out waiting for deferred actions to be invoked")

    p.End()

    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

func TestProc_goodLifecycle(t *testing.T) {
    p := New()
    p.End()
    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

func TestProc_doWithDeadProc(t *testing.T) {
    p := New()
    p.End()
    time.Sleep(100 * time.Millisecond)

    errUnexpected := fmt.Errorf("unexpected execution of delegated action")
    decorated := DoWith(p, DoerFunc(func(_ Action) <-chan error {
        return ErrorChan(errUnexpected)
    }))

    decorated.Do(func() {})
    fatalAfter(t, decorated.Done(), 5*time.Second, "timed out waiting for process death")
}

func TestProc_doWith(t *testing.T) {
    p := New()

    delegated := false
    decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
        delegated = true
        a()
        return nil
    }))

    executed := make(chan struct{})
    err := decorated.Do(func() {
        defer close(executed)
        if !delegated {
            t.Fatalf("expected delegated execution")
        }
    })
    if err == nil {
        t.Fatalf("expected !nil error chan")
    }

    fatalAfter(t, executed, 5*time.Second, "timed out waiting for deferred execution")
    fatalAfter(t, decorated.OnError(err, func(e error) {
        t.Fatalf("unexpected error: %v", e)
    }), 1*time.Second, "timed out waiting for doer result")

    decorated.End()
    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

func TestProc_doWithNestedTwice(t *testing.T) {
    p := New()

    delegated := false
    decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
        a()
        return nil
    }))

    decorated2 := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
        delegated = true
        a()
        return nil
    }))

    executed := make(chan struct{})
    err := decorated2.Do(func() {
        defer close(executed)
        if !delegated {
            t.Fatalf("expected delegated execution")
        }
    })
    if err == nil {
        t.Fatalf("expected !nil error chan")
    }

    fatalAfter(t, executed, 5*time.Second, "timed out waiting for deferred execution")
    fatalAfter(t, decorated2.OnError(err, func(e error) {
        t.Fatalf("unexpected error: %v", e)
    }), 1*time.Second, "timed out waiting for doer result")

    decorated2.End()
    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

func TestProc_doWithNestedErrorPropagation(t *testing.T) {
    p := New()

    delegated := false
    decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
        a()
        return nil
    }))

    expectedErr := fmt.Errorf("expecting this")
    errOnce := NewErrorOnce(p.Done())
    decorated2 := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
        delegated = true
        a()
        errOnce.Reportf("unexpected error in decorator2")
        return ErrorChanf("another unexpected error in decorator2")
    }))

    executed := make(chan struct{})
    err := decorated2.Do(func() {
        defer close(executed)
        if !delegated {
            t.Fatalf("expected delegated execution")
        }
        errOnce.Report(expectedErr)
    })
    if err == nil {
        t.Fatalf("expected !nil error chan")
    }
    errOnce.Send(err)

    foundError := false
    fatalAfter(t, executed, 1*time.Second, "timed out waiting for deferred execution")
    fatalAfter(t, decorated2.OnError(errOnce.Err(), func(e error) {
        if e != expectedErr {
            t.Fatalf("unexpected error: %v", e)
        } else {
            foundError = true
        }
    }), 1*time.Second, "timed out waiting for doer result")

    if !foundError {
        t.Fatalf("expected a propagated error")
    }

    decorated2.End()
    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

func runDelegationTest(t *testing.T, p Process, name string, errOnce ErrorOnce) {
    defer func() {
        t.Logf("runDelegationTest finished at " + time.Now().String())
    }()
    decorated := p

    const DEPTH = 100
    var wg sync.WaitGroup
    wg.Add(DEPTH)
    y := 0

    for x := 1; x <= DEPTH; x++ {
        x := x
        nextp := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
            if x == 1 {
                t.Logf("delegate chain invoked for " + name)
            }
            y++
            if y != x {
                return ErrorChanf("out of order delegated execution")
            }
            defer wg.Done()
            a()
            return nil
        }))
        decorated = nextp
    }

    executed := make(chan struct{})
    errCh := decorated.Do(func() {
        defer close(executed)
        if y != DEPTH {
            errOnce.Reportf("expected delegated execution")
        }
        t.Logf("executing deferred action: " + name + " at " + time.Now().String())
        errOnce.Send(nil) // we completed without error, let the listener know
    })
    if errCh == nil {
        t.Fatalf("expected !nil error chan")
    }

    // forward any scheduling errors to the listener; NOTHING else should attempt to read
    // from errCh after this point
    errOnce.Send(errCh)

    errorAfter(errOnce, executed, 5*time.Second, "timed out waiting for deferred execution")
    t.Logf("runDelegationTest received executed signal at " + time.Now().String())
}

func TestProc_doWithNestedX(t *testing.T) {
    t.Logf("starting test case at " + time.Now().String())
    p := New()
    errOnce := NewErrorOnce(p.Done())
    runDelegationTest(t, p, "nested", errOnce)
    <-p.End()
    select {
    case err := <-errOnce.Err():
        if err != nil {
            t.Fatalf("unexpected error: %v", err)
        }
    case <-time.After(5 * time.Second):
        t.Fatalf("timed out waiting for doer result")
    }
    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

// intended to be run with -race
func TestProc_doWithNestedXConcurrent(t *testing.T) {
    p := New()
    errOnce := NewErrorOnce(p.Done())
    var wg sync.WaitGroup
    const CONC = 20
    wg.Add(CONC)
    for i := 0; i < CONC; i++ {
        i := i
        runtime.After(func() { runDelegationTest(t, p, fmt.Sprintf("nested%d", i), errOnce) }).Then(wg.Done)
    }
    ch := runtime.After(wg.Wait)
    fatalAfter(t, ch, 10*time.Second, "timed out waiting for concurrent delegates")

    <-p.End()

    select {
    case err := <-errOnce.Err():
        if err != nil {
            t.Fatalf("unexpected error: %v", err)
        }
    case <-time.After(5 * time.Second):
        t.Fatalf("timed out waiting for doer result")
    }

    fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
55
contrib/mesos/pkg/proc/state.go
Normal file
@@ -0,0 +1,55 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package proc

import (
    "sync/atomic"
)

type stateType int32

const (
    stateNew stateType = iota
    stateRunning
    stateTerminal
)

func (s *stateType) get() stateType {
    return stateType(atomic.LoadInt32((*int32)(s)))
}

func (s *stateType) transition(from, to stateType) bool {
    return atomic.CompareAndSwapInt32((*int32)(s), int32(from), int32(to))
}

func (s *stateType) transitionTo(to stateType, unless ...stateType) bool {
    if len(unless) == 0 {
        atomic.StoreInt32((*int32)(s), int32(to))
        return true
    }
    for {
        state := s.get()
        for _, x := range unless {
            if state == x {
                return false
            }
        }
        if s.transition(state, to) {
            return true
        }
    }
}
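
// Usage sketch (reviewer-added, not part of this commit): the zero value of
// stateType is stateNew, so a freshly allocated value can transition straight
// into stateRunning, while transitionTo guards against racing terminations.
//
//	var s stateType                          // == stateNew
//	_ = s.transition(stateNew, stateRunning) // CAS: true on success
//	if s.transitionTo(stateTerminal, stateTerminal) {
//		// first (and only) terminator wins; repeat calls return false
//	}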
71
contrib/mesos/pkg/proc/types.go
Normal file
@@ -0,0 +1,71 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package proc

// Action is something that executes in the context of a process.
type Action func()

type Context interface {
    // End terminates the execution context.
    End() <-chan struct{}

    // Done returns a signal chan that will close upon the termination of this process.
    Done() <-chan struct{}
}

type Doer interface {
    // Do executes some action in some context. Actions are to be executed in a
    // concurrency-safe manner: no two actions should execute at the same time.
    // Errors are generated if the action cannot be executed (not by the execution
    // of the action) and should be testable with the error API of this package,
    // for example, IsProcessTerminated.
    Do(Action) <-chan error
}

// DoerFunc is an adapter func for the Doer interface.
type DoerFunc func(Action) <-chan error

type Process interface {
    Context
    Doer

    // OnError: see the top-level OnError func. This implementation will terminate
    // upon the arrival of an error (and subsequently invoke the error handler, if
    // given) or else upon the termination of the process (testable via
    // IsProcessTerminated).
    OnError(<-chan error, func(error)) <-chan struct{}

    // Running returns a signal chan that will close once the process is ready to run actions.
    Running() <-chan struct{}
}

// ErrorOnce is an error promise. If we ever start building out support for other
// promise types it will probably make sense to group them in some sort of
// "promises" package.
type ErrorOnce interface {
    // Err returns a chan that only ever sends one error, either obtained via
    // Report() or forward().
    Err() <-chan error

    // Report reports the given error via Err(), but only if no other errors
    // have been reported or forwarded.
    Report(error)
    Reportf(string, ...interface{})

    // forward waits for an error on the incoming chan, the result of which is
    // later obtained via Err() (if no other errors have been reported or forwarded).
    forward(<-chan error)

    // Send is non-blocking: it spins up a goroutine that reports an error (if
    // any) that occurs on the error chan.
    Send(<-chan error) ErrorOnce
}
18
contrib/mesos/pkg/profile/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package profile contains reusable code for profiling Go programs with pprof.
package profile
27
contrib/mesos/pkg/profile/profile.go
Normal file
@@ -0,0 +1,27 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package profile

import (
    "net/http"
    "net/http/pprof"
)

func InstallHandler(m *http.ServeMux) {
    // register similar endpoints as net/http/pprof.init() does
    m.Handle("/debug/pprof/", http.HandlerFunc(pprof.Index))
    m.Handle("/debug/pprof/profile", http.HandlerFunc(pprof.Profile))
    m.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol))
}
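
// Usage sketch (reviewer-added, not part of this commit): wire the pprof
// handlers into a mux served from a standalone HTTP endpoint; the address here
// is only an example.
//
//	mux := http.NewServeMux()
//	profile.InstallHandler(mux)
//	go http.ListenAndServe("127.0.0.1:6060", mux)
//	// then e.g.: go tool pprof http://127.0.0.1:6060/debug/pprof/profile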
373
contrib/mesos/pkg/queue/delay.go
Normal file
@@ -0,0 +1,373 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queue

import (
    "container/heap"
    "sync"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
)

type qitem struct {
    value    interface{}
    priority Priority
    index    int
    readd    func(item *qitem) // re-add the value of the item to the queue
}

// A priorityQueue implements heap.Interface and holds qitems.
type priorityQueue []*qitem

func (pq priorityQueue) Len() int { return len(pq) }

func (pq priorityQueue) Less(i, j int) bool {
    return pq[i].priority.ts.Before(pq[j].priority.ts)
}

func (pq priorityQueue) Swap(i, j int) {
    pq[i], pq[j] = pq[j], pq[i]
    pq[i].index = i
    pq[j].index = j
}

func (pq *priorityQueue) Push(x interface{}) {
    n := len(*pq)
    item := x.(*qitem)
    item.index = n
    *pq = append(*pq, item)
}

func (pq *priorityQueue) Pop() interface{} {
    old := *pq
    n := len(old)
    item := old[n-1]
    item.index = -1 // for safety
    *pq = old[0 : n-1]
    return item
}

// DelayQueue is a concurrency-safe, deadline-oriented queue that returns items
// after their delay period has expired.
type DelayQueue struct {
    queue priorityQueue
    lock  sync.RWMutex
    cond  sync.Cond
}

func NewDelayQueue() *DelayQueue {
    q := &DelayQueue{}
    q.cond.L = &q.lock
    return q
}

func (q *DelayQueue) Add(d Delayed) {
    deadline := extractFromDelayed(d)

    q.lock.Lock()
    defer q.lock.Unlock()

    // readd using the original deadline computed from the original delay
    var readd func(*qitem)
    readd = func(qp *qitem) {
        q.lock.Lock()
        defer q.lock.Unlock()
        heap.Push(&q.queue, &qitem{
            value:    d,
            priority: deadline,
            readd:    readd,
        })
        q.cond.Broadcast()
    }
    heap.Push(&q.queue, &qitem{
        value:    d,
        priority: deadline,
        readd:    readd,
    })
    q.cond.Broadcast()
}

// Offer adds `d` to the queue and returns true, but only if there is a
// deadline reported by d.Deadline().
func (q *DelayQueue) Offer(d Deadlined) bool {
    deadline, ok := extractFromDeadlined(d)
    if ok {
        q.lock.Lock()
        defer q.lock.Unlock()
        heap.Push(&q.queue, &qitem{
            value:    d,
            priority: deadline,
            readd: func(qp *qitem) {
                q.Offer(qp.value.(Deadlined))
            },
        })
        q.cond.Broadcast()
    }
    return ok
}

// Pop waits for the delay of the next item in the queue to expire, blocking if
// there are no items in the queue. It does not guarantee first-come-first-serve
// ordering with respect to clients.
func (q *DelayQueue) Pop() interface{} {
    // doesn't implement cancellation, will always return a non-nil value
    return q.pop(func() *qitem {
        q.lock.Lock()
        defer q.lock.Unlock()
        for q.queue.Len() == 0 {
            q.cond.Wait()
        }
        x := heap.Pop(&q.queue)
        item := x.(*qitem)
        return item
    }, nil)
}

// pop returns a non-nil value from the queue, or else nil if/when cancelled; if
// cancel is nil then cancellation is disabled and this func must return a
// non-nil value.
func (q *DelayQueue) pop(next func() *qitem, cancel <-chan struct{}) interface{} {
    var ch chan struct{}
    for {
        item := next()
        if item == nil {
            // cancelled
            return nil
        }
        x := item.value
        waitingPeriod := item.priority.ts.Sub(time.Now())
        if waitingPeriod >= 0 {
            // listen for calls to Add() while we're waiting for the deadline
            if ch == nil {
                ch = make(chan struct{}, 1)
            }
            go func() {
                q.lock.Lock()
                defer q.lock.Unlock()
                q.cond.Wait()
                ch <- struct{}{}
            }()
            select {
            case <-cancel:
                item.readd(item)
                return nil
            case <-ch:
                // we may no longer have the earliest deadline, re-try
                item.readd(item)
                continue
            case <-time.After(waitingPeriod):
                // noop
            case <-item.priority.notify:
                // noop
            }
        }
        return x
    }
}
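
// Usage sketch (reviewer-added, not part of this commit): any type satisfying
// Delayed (a GetDelay method, per the testjob helper in delay_test.go) can ride
// the queue; Pop blocks until the earliest deadline expires, even if an item
// with an earlier deadline arrives mid-wait. retryTask is hypothetical.
//
//	type retryTask struct{ backoff time.Duration }
//
//	func (r *retryTask) GetDelay() time.Duration { return r.backoff }
//
//	dq := NewDelayQueue()
//	dq.Add(&retryTask{backoff: 3 * time.Second})
//	task := dq.Pop().(*retryTask) // returns after ~3s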

// DelayFIFO is a deadline-oriented FIFO: if multiple adds/updates of a single
// item happen while an item is in the queue before it has been processed, it
// will only be processed once, and when it is processed, the most recent
// version will be processed. Items are popped in order of their priority,
// currently controlled by a delay or deadline assigned to each item in the queue.
type DelayFIFO struct {
    // internal deadline-based priority queue
    delegate *DelayQueue
    // We depend on the property that items in the set are in the queue and vice versa.
    items          map[string]*qitem
    deadlinePolicy DeadlinePolicy
}

func (q *DelayFIFO) lock() {
    q.delegate.lock.Lock()
}

func (q *DelayFIFO) unlock() {
    q.delegate.lock.Unlock()
}

func (q *DelayFIFO) rlock() {
    q.delegate.lock.RLock()
}

func (q *DelayFIFO) runlock() {
    q.delegate.lock.RUnlock()
}

func (q *DelayFIFO) queue() *priorityQueue {
    return &q.delegate.queue
}

func (q *DelayFIFO) cond() *sync.Cond {
    return &q.delegate.cond
}

// Add inserts an item, and puts it in the queue. The item is only enqueued
// if it doesn't already exist in the set.
func (q *DelayFIFO) Add(d UniqueDelayed, rp ReplacementPolicy) {
    deadline := extractFromDelayed(d)
    id := d.GetUID()
    var adder func(*qitem)
    adder = func(*qitem) {
        q.add(id, deadline, d, KeepExisting, adder)
    }
    q.add(id, deadline, d, rp, adder)
}

func (q *DelayFIFO) Offer(d UniqueDeadlined, rp ReplacementPolicy) bool {
    if deadline, ok := extractFromDeadlined(d); ok {
        id := d.GetUID()
        q.add(id, deadline, d, rp, func(qp *qitem) { q.Offer(qp.value.(UniqueDeadlined), KeepExisting) })
        return true
    }
    return false
}

func (q *DelayFIFO) add(id string, deadline Priority, value interface{}, rp ReplacementPolicy, adder func(*qitem)) {
    q.lock()
    defer q.unlock()
    if item, exists := q.items[id]; !exists {
        item = &qitem{
            value:    value,
            priority: deadline,
            readd:    adder,
        }
        heap.Push(q.queue(), item)
        q.items[id] = item
    } else {
        // this is an update of an existing item
        item.value = rp.replacementValue(item.value, value)
        item.priority = q.deadlinePolicy.nextDeadline(item.priority, deadline)
        heap.Fix(q.queue(), item.index)
    }
    q.cond().Broadcast()
}

// Delete removes an item. It doesn't add it to the queue, because
// this implementation assumes the consumer only cares about the objects,
// not their priority order.
func (f *DelayFIFO) Delete(id string) {
    f.lock()
    defer f.unlock()
    delete(f.items, id)
}

// List returns a list of all the items.
func (f *DelayFIFO) List() []UniqueID {
    f.rlock()
    defer f.runlock()
    list := make([]UniqueID, 0, len(f.items))
    for _, item := range f.items {
        list = append(list, item.value.(UniqueDelayed))
    }
    return list
}

// ContainedIDs returns a util.StringSet containing all IDs of the stored items.
// This is a snapshot of a moment in time, and one should keep in mind that
// other go routines can add or remove items after you call this.
func (c *DelayFIFO) ContainedIDs() util.StringSet {
    c.rlock()
    defer c.runlock()
    set := util.StringSet{}
    for id := range c.items {
        set.Insert(id)
    }
    return set
}

// Get returns the requested item, or sets exists=false.
func (f *DelayFIFO) Get(id string) (UniqueID, bool) {
    f.rlock()
    defer f.runlock()
    if item, exists := f.items[id]; exists {
        return item.value.(UniqueID), true
    }
    return nil, false
}

// Await is a variant of DelayQueue.Pop() for UniqueDelayed items: it waits at
// most `timeout` for an item, returning nil if none became available in time.
func (q *DelayFIFO) Await(timeout time.Duration) UniqueID {
    cancel := make(chan struct{})
    ch := make(chan interface{}, 1)
    go func() { ch <- q.pop(cancel) }()
    var x interface{}
    select {
    case <-time.After(timeout):
        close(cancel)
        x = <-ch
    case x = <-ch:
        // noop
    }
    if x != nil {
        return x.(UniqueID)
    }
    return nil
}

// Pop is a variant of DelayQueue.Pop() for UniqueDelayed items.
func (q *DelayFIFO) Pop() UniqueID {
    return q.pop(nil).(UniqueID)
}

// pop is a variant of DelayQueue.pop that implements optional cancellation.
func (q *DelayFIFO) pop(cancel chan struct{}) interface{} {
    next := func() *qitem {
        q.lock()
        defer q.unlock()
        for {
            for q.queue().Len() == 0 {
                signal := make(chan struct{})
                go func() {
                    defer close(signal)
                    q.cond().Wait()
                }()
                select {
                case <-cancel:
                    // we may not have the lock yet, so
                    // broadcast to abort Wait, then
                    // return after lock re-acquisition
                    q.cond().Broadcast()
                    <-signal
                    return nil
                case <-signal:
                    // we have the lock, re-check
                    // the queue for data...
                }
            }
            x := heap.Pop(q.queue())
            item := x.(*qitem)
            unique := item.value.(UniqueID)
            uid := unique.GetUID()
            if _, ok := q.items[uid]; !ok {
                // item was deleted, keep looking
                continue
            }
            delete(q.items, uid)
            return item
        }
    }
    return q.delegate.pop(next, cancel)
}

func NewDelayFIFO() *DelayFIFO {
    f := &DelayFIFO{
        delegate: NewDelayQueue(),
        items:    map[string]*qitem{},
    }
    return f
}
406
contrib/mesos/pkg/queue/delay_test.go
Normal file
@@ -0,0 +1,406 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queue

import (
    "sync/atomic"
    "testing"
    "time"

    "github.com/stretchr/testify/assert"
)

const (
    tolerance = 100 * time.Millisecond // go time delays aren't perfect, this is our tolerance for errors w.r.t. expected timeouts
)

func timedPriority(t time.Time) Priority {
    return Priority{ts: t}
}

func TestPQ(t *testing.T) {
    t.Parallel()

    var pq priorityQueue
    if pq.Len() != 0 {
        t.Fatalf("pq should be empty")
    }

    now := timedPriority(time.Now())
    now2 := timedPriority(now.ts.Add(2 * time.Second))
    pq.Push(&qitem{priority: now2})
    if pq.Len() != 1 {
        t.Fatalf("pq.len should be 1")
    }
    x := pq.Pop()
    if x == nil {
        t.Fatalf("x is nil")
    }
    if pq.Len() != 0 {
        t.Fatalf("pq should be empty")
    }
    item := x.(*qitem)
    if !item.priority.Equal(now2) {
        t.Fatalf("item.priority != now2")
    }

    pq.Push(&qitem{priority: now2})
    pq.Push(&qitem{priority: now2})
    pq.Push(&qitem{priority: now2})
    pq.Push(&qitem{priority: now2})
    pq.Push(&qitem{priority: now2})
    pq.Pop()
    pq.Pop()
    pq.Pop()
    pq.Pop()
    pq.Pop()
    if pq.Len() != 0 {
        t.Fatalf("pq should be empty")
    }
    now4 := timedPriority(now.ts.Add(4 * time.Second))
    now6 := timedPriority(now.ts.Add(6 * time.Second))
    pq.Push(&qitem{priority: now2})
    pq.Push(&qitem{priority: now4})
    pq.Push(&qitem{priority: now6})
    pq.Swap(0, 2)
    if !pq[0].priority.Equal(now6) || !pq[2].priority.Equal(now2) {
        t.Fatalf("swap failed")
    }
    if pq.Less(1, 2) {
        t.Fatalf("now4 < now2")
    }
}

func TestPopEmptyPQ(t *testing.T) {
    t.Parallel()
    defer func() {
        if r := recover(); r == nil {
            t.Fatalf("Expected panic from popping an empty PQ")
        }
    }()
    var pq priorityQueue
    pq.Pop()
}

type testjob struct {
    d        time.Duration
    t        time.Time
    deadline *time.Time
    uid      string
    instance int
}

func (j *testjob) GetDelay() time.Duration {
    return j.d
}

func (j testjob) GetUID() string {
    return j.uid
}

func (td *testjob) Deadline() (deadline time.Time, ok bool) {
    if td.deadline != nil {
        return *td.deadline, true
    } else {
        return time.Now(), false
    }
}

func TestDQ_sanity_check(t *testing.T) {
    t.Parallel()

    dq := NewDelayQueue()
    delay := 2 * time.Second
    dq.Add(&testjob{d: delay})

    before := time.Now()
    x := dq.Pop()

    now := time.Now()
    waitPeriod := now.Sub(before)

    if waitPeriod+tolerance < delay {
        t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
    }
    if x == nil {
        t.Fatalf("x is nil")
    }
    item := x.(*testjob)
    if item.d != delay {
        t.Fatalf("d != delay")
    }
}

func TestDQ_Offer(t *testing.T) {
    t.Parallel()
    assert := assert.New(t)

    dq := NewDelayQueue()
    delay := time.Second

    added := dq.Offer(&testjob{})
    if added {
        t.Fatalf("DelayQueue should not add offered job without deadline")
    }

    deadline := time.Now().Add(delay)
    added = dq.Offer(&testjob{deadline: &deadline})
    if !added {
        t.Fatalf("DelayQueue should add offered job with deadline")
    }

    before := time.Now()
    x := dq.Pop()

    now := time.Now()
    waitPeriod := now.Sub(before)

    if waitPeriod+tolerance < delay {
        t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
    }
    assert.NotNil(x)
    assert.Equal(x.(*testjob).deadline, &deadline)
}

func TestDQ_ordered_add_pop(t *testing.T) {
    t.Parallel()

    dq := NewDelayQueue()
    dq.Add(&testjob{d: 2 * time.Second})
    dq.Add(&testjob{d: 1 * time.Second})
    dq.Add(&testjob{d: 3 * time.Second})

    var finished [3]*testjob
    before := time.Now()
    idx := int32(-1)
    ch := make(chan bool, 3)
    //TODO: replace with `for range finished` once Go 1.3 support is dropped
    for n := 0; n < len(finished); n++ {
        go func() {
            var ok bool
            x := dq.Pop()
            i := atomic.AddInt32(&idx, 1)
            if finished[i], ok = x.(*testjob); !ok {
                t.Fatalf("expected a *testjob, not %v", x)
            }
            finished[i].t = time.Now()
            ch <- true
        }()
    }
    <-ch
    <-ch
    <-ch

    after := time.Now()
    totalDelay := after.Sub(before)
    if totalDelay+tolerance < (3 * time.Second) {
        t.Fatalf("totalDelay < 3s: %v", totalDelay)
    }
    for i, v := range finished {
        if v == nil {
            t.Fatalf("task %d was nil", i)
        }
        expected := time.Duration(i+1) * time.Second
        if v.d != expected {
            t.Fatalf("task %d had delay-priority %v, expected %v", i, v.d, expected)
        }
        actualDelay := v.t.Sub(before)
        if actualDelay+tolerance < v.d {
            t.Fatalf("task %d had actual-delay %v < expected delay %v", i, actualDelay, v.d)
        }
    }
}

func TestDQ_always_pop_earliest_deadline(t *testing.T) {
    t.Parallel()

    // add a testjob with delay of 2s
    // spawn a func f1 that attempts to Pop() and wait for f1 to begin
    // add a testjob with a delay of 1s
    // check that the func f1 actually popped the 1s task (not the 2s task)

    dq := NewDelayQueue()
    dq.Add(&testjob{d: 2 * time.Second})
    ch := make(chan *testjob)
    started := make(chan bool)

    go func() {
        started <- true
        x := dq.Pop()
        job := x.(*testjob)
        job.t = time.Now()
        ch <- job
    }()

    <-started
    time.Sleep(500 * time.Millisecond) // give plenty of time for Pop() to enter
    expected := 1 * time.Second
    dq.Add(&testjob{d: expected})
    job := <-ch

    if expected != job.d {
        t.Fatalf("expected delay-priority of %v, instead got %v", expected, job.d)
    }

    job = dq.Pop().(*testjob)
    expected = 2 * time.Second
    if expected != job.d {
        t.Fatalf("expected delay-priority of %v, instead got %v", expected, job.d)
    }
}

func TestDQ_always_pop_earliest_deadline_multi(t *testing.T) {
    t.Parallel()

    dq := NewDelayQueue()
    dq.Add(&testjob{d: 2 * time.Second})

    ch := make(chan *testjob)
    multi := 10
    started := make(chan bool, multi)

    go func() {
        started <- true
        for i := 0; i < multi; i++ {
            x := dq.Pop()
            job := x.(*testjob)
            job.t = time.Now()
            ch <- job
        }
    }()

    <-started
    time.Sleep(500 * time.Millisecond) // give plenty of time for Pop() to enter
    expected := 1 * time.Second

    for i := 0; i < multi; i++ {
        dq.Add(&testjob{d: expected})
    }
    for i := 0; i < multi; i++ {
        job := <-ch
        if expected != job.d {
            t.Fatalf("expected delay-priority of %v, instead got %v", expected, job.d)
        }
    }

    job := dq.Pop().(*testjob)
    expected = 2 * time.Second
    if expected != job.d {
        t.Fatalf("expected delay-priority of %v, instead got %v", expected, job.d)
    }
}

func TestDQ_negative_delay(t *testing.T) {
    t.Parallel()

    dq := NewDelayQueue()
    delay := -2 * time.Second
    dq.Add(&testjob{d: delay})

    before := time.Now()
    x := dq.Pop()

    now := time.Now()
    waitPeriod := now.Sub(before)

    if waitPeriod > tolerance {
        t.Fatalf("delay too long: %v, expected something less than: %v", waitPeriod, tolerance)
    }
    if x == nil {
        t.Fatalf("x is nil")
    }
    item := x.(*testjob)
    if item.d != delay {
        t.Fatalf("d != delay")
    }
}

func TestDFIFO_sanity_check(t *testing.T) {
    t.Parallel()
    assert := assert.New(t)

    df := NewDelayFIFO()
    delay := 2 * time.Second
    df.Add(&testjob{d: delay, uid: "a", instance: 1}, ReplaceExisting)
    assert.True(df.ContainedIDs().Has("a"))

    // re-add by ReplaceExisting
    df.Add(&testjob{d: delay, uid: "a", instance: 2}, ReplaceExisting)
    assert.True(df.ContainedIDs().Has("a"))

    a, ok := df.Get("a")
    assert.True(ok)
    assert.Equal(a.(*testjob).instance, 2)

    // re-add by KeepExisting
    df.Add(&testjob{d: delay, uid: "a", instance: 3}, KeepExisting)
    assert.True(df.ContainedIDs().Has("a"))

    a, ok = df.Get("a")
    assert.True(ok)
    assert.Equal(a.(*testjob).instance, 2)

    // pop last
    before := time.Now()
    x := df.Pop()
    assert.Equal(a.(*testjob).instance, 2)

    now := time.Now()
    waitPeriod := now.Sub(before)

    if waitPeriod+tolerance < delay {
        t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
    }
    if x == nil {
        t.Fatalf("x is nil")
    }
    item := x.(*testjob)
    if item.d != delay {
        t.Fatalf("d != delay")
    }
}

func TestDFIFO_Offer(t *testing.T) {
    t.Parallel()
    assert := assert.New(t)

    dq := NewDelayFIFO()
    delay := time.Second

    added := dq.Offer(&testjob{instance: 1}, ReplaceExisting)
    if added {
        t.Fatalf("DelayFIFO should not add offered job without deadline")
    }

    deadline := time.Now().Add(delay)
    added = dq.Offer(&testjob{deadline: &deadline, instance: 2}, ReplaceExisting)
    if !added {
        t.Fatalf("DelayFIFO should add offered job with deadline")
    }

    before := time.Now()
    x := dq.Pop()

    now := time.Now()
    waitPeriod := now.Sub(before)

    if waitPeriod+tolerance < delay {
        t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
    }
    assert.NotNil(x)
    assert.Equal(x.(*testjob).instance, 2)
}
19
contrib/mesos/pkg/queue/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package queue provides several queue implementations, originally
// inspired by Kubernetes pkg/client/cache/fifo.
package queue
403
contrib/mesos/pkg/queue/historical.go
Normal file
@@ -0,0 +1,403 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queue

import (
    "fmt"
    "reflect"
    "sync"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
)

type entry struct {
    value UniqueCopyable
    event EventType
}

type deletedEntry struct {
    *entry
    expiration time.Time
}

func (e *entry) Value() UniqueCopyable {
    return e.value
}

func (e *entry) Copy() Copyable {
    if e == nil {
        return nil
    }
    return &entry{e.value.Copy().(UniqueCopyable), e.event}
}

func (e *entry) Is(types EventType) bool {
    return types&e.event != 0
}

func (e *deletedEntry) Copy() Copyable {
    if e == nil {
        return nil
    }
    return &deletedEntry{e.entry.Copy().(*entry), e.expiration}
}

// pigeon delivers a message
type pigeon func(msg Entry)

func dead(msg Entry) {
    // intentionally blank
}

// HistoricalFIFO receives adds and updates from a Reflector, and puts them in a queue for
// FIFO order processing. If multiple adds/updates of a single item happen while
// an item is in the queue before it has been processed, it will only be
// processed once, and when it is processed, the most recent version will be
// processed. This can't be done with a channel.
type HistoricalFIFO struct {
    lock      sync.RWMutex
    cond      sync.Cond
    items     map[string]Entry // We depend on the property that items in the queue are in the set.
    queue     []string
    carrier   pigeon // may be dead, but never nil
    gcc       int
    lingerTTL time.Duration
}

// checkType panics if obj doesn't implement UniqueCopyable; otherwise it
// returns the same object, typecast.
func checkType(obj interface{}) UniqueCopyable {
    if v, ok := obj.(UniqueCopyable); !ok {
        panic(fmt.Sprintf("Illegal object type, expected UniqueCopyable: %T", obj))
    } else {
        return v
    }
}

// Add inserts an item, and puts it in the queue. The item is only enqueued
// if it doesn't already exist in the set.
func (f *HistoricalFIFO) Add(v interface{}) error {
    obj := checkType(v)
    notifications := []Entry(nil)
    defer func() {
        for _, e := range notifications {
            f.carrier(e)
        }
    }()

    f.lock.Lock()
    defer f.lock.Unlock()

    id := obj.GetUID()
    if entry, exists := f.items[id]; !exists {
        f.queue = append(f.queue, id)
    } else {
        if entry.Is(DELETE_EVENT | POP_EVENT) {
            f.queue = append(f.queue, id)
        }
    }
    notifications = f.merge(id, obj)
    f.cond.Broadcast()
    return nil
}

// Update is the same as Add in this implementation.
func (f *HistoricalFIFO) Update(obj interface{}) error {
    return f.Add(obj)
}

// Delete removes an item. It doesn't add it to the queue, because
// this implementation assumes the consumer only cares about the objects,
// not the order in which they were created/added.
func (f *HistoricalFIFO) Delete(v interface{}) error {
    obj := checkType(v)
    deleteEvent := (Entry)(nil)
    defer func() {
        f.carrier(deleteEvent)
    }()

    f.lock.Lock()
    defer f.lock.Unlock()
    id := obj.GetUID()
    item, exists := f.items[id]
    if exists && !item.Is(DELETE_EVENT) {
        e := item.(*entry)
        e.event = DELETE_EVENT
        deleteEvent = &deletedEntry{e, time.Now().Add(f.lingerTTL)}
        f.items[id] = deleteEvent
    }
    return nil
}

// List returns a list of all the items.
func (f *HistoricalFIFO) List() []interface{} {
    f.lock.RLock()
    defer f.lock.RUnlock()

    // TODO(jdef): slightly overallocates b/c of deleted items
    list := make([]interface{}, 0, len(f.queue))

    for _, entry := range f.items {
        if entry.Is(DELETE_EVENT | POP_EVENT) {
            continue
        }
        list = append(list, entry.Value().Copy())
    }
    return list
}

// ListKeys returns a list of the keys of all the items.
func (f *HistoricalFIFO) ListKeys() []string {
    f.lock.RLock()
    defer f.lock.RUnlock()

    // TODO(jdef): slightly overallocates b/c of deleted items
    list := make([]string, 0, len(f.queue))

    for key, entry := range f.items {
        if entry.Is(DELETE_EVENT | POP_EVENT) {
            continue
        }
        list = append(list, key)
    }
    return list
}

// ContainedIDs returns a util.StringSet containing all IDs of the stored items.
// This is a snapshot of a moment in time, and one should keep in mind that
// other go routines can add or remove items after you call this.
func (c *HistoricalFIFO) ContainedIDs() util.StringSet {
    c.lock.RLock()
    defer c.lock.RUnlock()
    set := util.StringSet{}
    for id, entry := range c.items {
        if entry.Is(DELETE_EVENT | POP_EVENT) {
            continue
        }
        set.Insert(id)
    }
    return set
}

// Get returns the requested item, or sets exists=false.
func (f *HistoricalFIFO) Get(v interface{}) (interface{}, bool, error) {
    obj := checkType(v)
    return f.GetByKey(obj.GetUID())
}

// GetByKey returns the requested item, or sets exists=false.
func (f *HistoricalFIFO) GetByKey(id string) (interface{}, bool, error) {
    f.lock.RLock()
    defer f.lock.RUnlock()
    entry, exists := f.items[id]
    if exists && !entry.Is(DELETE_EVENT|POP_EVENT) {
        return entry.Value().Copy(), true, nil
    }
    return nil, false, nil
}

// Poll returns true if the item identified by id exists and its most recent
// event matches the given event type.
func (f *HistoricalFIFO) Poll(id string, t EventType) bool {
    f.lock.RLock()
    defer f.lock.RUnlock()
    entry, exists := f.items[id]
    return exists && entry.Is(t)
}

// Await is a variant of Pop() that waits at most `timeout` for an item,
// returning nil if none became available in time.
func (q *HistoricalFIFO) Await(timeout time.Duration) interface{} {
    cancel := make(chan struct{})
    ch := make(chan interface{}, 1)
    go func() { ch <- q.pop(cancel) }()
    select {
    case <-time.After(timeout):
        close(cancel)
        return <-ch
    case x := <-ch:
        return x
    }
}

func (f *HistoricalFIFO) Pop() interface{} {
    return f.pop(nil)
}

func (f *HistoricalFIFO) pop(cancel chan struct{}) interface{} {
    popEvent := (Entry)(nil)
    defer func() {
        f.carrier(popEvent)
    }()

    f.lock.Lock()
    defer f.lock.Unlock()
    for {
        for len(f.queue) == 0 {
            signal := make(chan struct{})
            go func() {
                defer close(signal)
                f.cond.Wait()
            }()
            select {
            case <-cancel:
                // we may not have the lock yet, so
                // broadcast to abort Wait, then
                // return after lock re-acquisition
                f.cond.Broadcast()
                <-signal
                return nil
            case <-signal:
                // we have the lock, re-check
                // the queue for data...
            }
        }
        id := f.queue[0]
        f.queue = f.queue[1:]
        item, ok := f.items[id]
        if !ok || item.Is(DELETE_EVENT|POP_EVENT) {
            // Item may have been deleted subsequently.
            continue
        }
        value := item.Value()
        popEvent = &entry{value, POP_EVENT}
        f.items[id] = popEvent
        return value.Copy()
    }
}
|
||||
func (f *HistoricalFIFO) Replace(objs []interface{}) error {
|
||||
notifications := make([]Entry, 0, len(objs))
|
||||
defer func() {
|
||||
for _, e := range notifications {
|
||||
f.carrier(e)
|
||||
}
|
||||
}()
|
||||
|
||||
idToObj := make(map[string]interface{})
|
||||
for _, v := range objs {
|
||||
obj := checkType(v)
|
||||
idToObj[obj.GetUID()] = v
|
||||
}
|
||||
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
f.queue = f.queue[:0]
|
||||
now := time.Now()
|
||||
for id, v := range f.items {
|
||||
if _, exists := idToObj[id]; !exists && !v.Is(DELETE_EVENT) {
|
||||
// a non-deleted entry in the items list that doesn't show up in the
|
||||
// new list: mark it as deleted
|
||||
ent := v.(*entry)
|
||||
ent.event = DELETE_EVENT
|
||||
e := &deletedEntry{ent, now.Add(f.lingerTTL)}
|
||||
f.items[id] = e
|
||||
notifications = append(notifications, e)
|
||||
}
|
||||
}
|
||||
for id, v := range idToObj {
|
||||
obj := checkType(v)
|
||||
f.queue = append(f.queue, id)
|
||||
n := f.merge(id, obj)
|
||||
notifications = append(notifications, n...)
|
||||
}
|
||||
if len(f.queue) > 0 {
|
||||
f.cond.Broadcast()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// garbage collect DELETEd items whose TTL has expired; the IDs of such items are removed
|
||||
// from the queue. This impl assumes that caller has acquired state lock.
|
||||
func (f *HistoricalFIFO) gc() {
|
||||
now := time.Now()
|
||||
deleted := make(map[string]struct{})
|
||||
for id, v := range f.items {
|
||||
if v.Is(DELETE_EVENT) {
|
||||
ent := v.(*deletedEntry)
|
||||
if ent.expiration.Before(now) {
|
||||
delete(f.items, id)
|
||||
deleted[id] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
// remove deleted items from the queue, will likely (slightly) overallocate here
|
||||
queue := make([]string, 0, len(f.queue))
|
||||
for _, id := range f.queue {
|
||||
if _, exists := deleted[id]; !exists {
|
||||
queue = append(queue, id)
|
||||
}
|
||||
}
|
||||
f.queue = queue
|
||||
}
|
||||
|
||||
// Assumes that the caller has acquired the state lock.
|
||||
func (f *HistoricalFIFO) merge(id string, obj UniqueCopyable) (notifications []Entry) {
|
||||
item, exists := f.items[id]
|
||||
now := time.Now()
|
||||
if !exists {
|
||||
e := &entry{obj.Copy().(UniqueCopyable), ADD_EVENT}
|
||||
f.items[id] = e
|
||||
notifications = append(notifications, e)
|
||||
} else {
|
||||
if !item.Is(DELETE_EVENT) && item.Value().GetUID() != obj.GetUID() {
|
||||
// hidden DELETE!
|
||||
// (1) append a DELETE
|
||||
// (2) append an ADD
|
||||
// .. and notify listeners in that order
|
||||
ent := item.(*entry)
|
||||
ent.event = DELETE_EVENT
|
||||
e1 := &deletedEntry{ent, now.Add(f.lingerTTL)}
|
||||
e2 := &entry{obj.Copy().(UniqueCopyable), ADD_EVENT}
|
||||
f.items[id] = e2
|
||||
notifications = append(notifications, e1, e2)
|
||||
} else if !reflect.DeepEqual(obj, item.Value()) {
|
||||
//TODO(jdef): it would be nice if we could rely on resource versions
|
||||
//instead of doing a DeepEqual. Maybe someday we'll be able to.
|
||||
e := &entry{obj.Copy().(UniqueCopyable), UPDATE_EVENT}
|
||||
f.items[id] = e
|
||||
notifications = append(notifications, e)
|
||||
}
|
||||
}
|
||||
// check for garbage collection
|
||||
f.gcc++
|
||||
if f.gcc%256 == 0 { //TODO(jdef): extract constant
|
||||
f.gcc = 0
|
||||
f.gc()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// NewHistorical returns a Store which can be used to queue up items to
|
||||
// process. If a non-nil Mux is provided, then modifications to the
|
||||
// the FIFO are delivered on a channel specific to this fifo.
|
||||
func NewHistorical(ch chan<- Entry) FIFO {
|
||||
carrier := dead
|
||||
if ch != nil {
|
||||
carrier = func(msg Entry) {
|
||||
if msg != nil {
|
||||
ch <- msg.Copy().(Entry)
|
||||
}
|
||||
}
|
||||
}
|
||||
f := &HistoricalFIFO{
|
||||
items: map[string]Entry{},
|
||||
queue: []string{},
|
||||
carrier: carrier,
|
||||
lingerTTL: 5 * time.Minute, // TODO(jdef): extract constant
|
||||
}
|
||||
f.cond.L = &f.lock
|
||||
return f
|
||||
}
|
||||
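For review context, here is a minimal consumer sketch of the HistoricalFIFO above. The import path, the demoPod type, and the timings are illustrative assumptions, not part of this commit:

package main

import (
	"fmt"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue" // assumed import path
)

// demoPod is a hypothetical UniqueCopyable payload.
type demoPod struct{ id string }

func (p *demoPod) Copy() queue.Copyable { return &demoPod{p.id} }
func (p *demoPod) GetUID() string       { return p.id }

func main() {
	// Every Add/Update/Delete/Pop is mirrored as an Entry on this channel,
	// letting an observer reconstruct the event history of each object.
	events := make(chan queue.Entry, 16)
	f := queue.NewHistorical(events)

	go func() {
		for e := range events {
			fmt.Printf("event for %s, popped=%v\n", e.Value().GetUID(), e.Is(queue.POP_EVENT))
		}
	}()

	f.Add(&demoPod{"pod-1"})
	popped := f.Pop().(*demoPod) // blocks until an item is ready
	fmt.Println("popped:", popped.id)
	time.Sleep(50 * time.Millisecond) // give the observer a moment to drain
}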
191
contrib/mesos/pkg/queue/historical_test.go
Normal file
@@ -0,0 +1,191 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queue

import (
	"fmt"
	"testing"
	"time"
)

type _int int
type _uint uint

func (i _int) Copy() Copyable {
	return i
}

func (i _int) GetUID() string {
	return fmt.Sprintf("INT%d", int(i))
}

func (i _uint) Copy() Copyable {
	return i
}

func (i _uint) GetUID() string {
	return fmt.Sprintf("UINT%d", uint64(i))
}

type testObj struct {
	id    string
	value int
}

func (i *testObj) Copy() Copyable {
	if i == nil {
		return nil
	} else {
		return &testObj{i.id, i.value}
	}
}

func (i *testObj) GetUID() string {
	return i.id
}

func TestFIFO_basic(t *testing.T) {
	f := NewHistorical(nil)
	const amount = 500
	go func() {
		for i := 0; i < amount; i++ {
			f.Add(_int(i + 1))
		}
	}()
	go func() {
		for u := uint(0); u < amount; u++ {
			f.Add(_uint(u + 1))
		}
	}()

	lastInt := _int(0)
	lastUint := _uint(0)
	for i := 0; i < amount*2; i++ {
		switch obj := f.Pop().(type) {
		case _int:
			if obj <= lastInt {
				t.Errorf("got %v (int) out of order, last was %v", obj, lastInt)
			}
			lastInt = obj
		case _uint:
			if obj <= lastUint {
				t.Errorf("got %v (uint) out of order, last was %v", obj, lastUint)
			} else {
				lastUint = obj
			}
		default:
			t.Fatalf("unexpected type %#v", obj)
		}
	}
}

func TestFIFO_addUpdate(t *testing.T) {
	f := NewHistorical(nil)
	f.Add(&testObj{"foo", 10})
	f.Update(&testObj{"foo", 15})
	got := make(chan *testObj, 2)
	go func() {
		for {
			got <- f.Pop().(*testObj)
		}
	}()

	first := <-got
	if e, a := 15, first.value; e != a {
		t.Errorf("Didn't get updated value (%v), got %v", e, a)
	}
	select {
	case unexpected := <-got:
		t.Errorf("Got second value %v", unexpected)
	case <-time.After(50 * time.Millisecond):
	}
	_, exists, _ := f.GetByKey("foo")
	if exists {
		t.Errorf("item did not get removed")
	}
}

func TestFIFO_addReplace(t *testing.T) {
	f := NewHistorical(nil)
	f.Add(&testObj{"foo", 10})
	f.Replace([]interface{}{&testObj{"foo", 15}})
	got := make(chan *testObj, 2)
	go func() {
		for {
			got <- f.Pop().(*testObj)
		}
	}()

	first := <-got
	if e, a := 15, first.value; e != a {
		t.Errorf("Didn't get updated value (%v), got %v", e, a)
	}
	select {
	case unexpected := <-got:
		t.Errorf("Got second value %v", unexpected)
	case <-time.After(50 * time.Millisecond):
	}
	_, exists, _ := f.GetByKey("foo")
	if exists {
		t.Errorf("item did not get removed")
	}
}

func TestFIFO_detectLineJumpers(t *testing.T) {
	f := NewHistorical(nil)

	f.Add(&testObj{"foo", 10})
	f.Add(&testObj{"bar", 1})
	f.Add(&testObj{"foo", 11})
	f.Add(&testObj{"foo", 13})
	f.Add(&testObj{"zab", 30})

	err := error(nil)
	done := make(chan struct{})
	go func() {
		defer close(done)
		if e, a := 13, f.Pop().(*testObj).value; a != e {
			err = fmt.Errorf("expected %d, got %d", e, a)
			return
		}

		f.Add(&testObj{"foo", 14}) // ensure foo doesn't jump back in line

		if e, a := 1, f.Pop().(*testObj).value; a != e {
			err = fmt.Errorf("expected %d, got %d", e, a)
			return
		}

		if e, a := 30, f.Pop().(*testObj).value; a != e {
			err = fmt.Errorf("expected %d, got %d", e, a)
			return
		}

		if e, a := 14, f.Pop().(*testObj).value; a != e {
			err = fmt.Errorf("expected %d, got %d", e, a)
			return
		}
	}()
	select {
	case <-done:
		if err != nil {
			t.Fatal(err)
		}
	case <-time.After(1 * time.Second):
		t.Fatal("Deadlocked unit test")
	}
}
103
contrib/mesos/pkg/queue/interface.go
Normal file
@@ -0,0 +1,103 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queue

import (
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
)

type EventType int

const (
	ADD_EVENT EventType = 1 << iota
	UPDATE_EVENT
	DELETE_EVENT
	POP_EVENT
)

type Entry interface {
	Copyable
	Value() UniqueCopyable
	// types is a logically OR'd combination of EventType, e.g. ADD_EVENT|UPDATE_EVENT
	Is(types EventType) bool
}

type Copyable interface {
	// return an independent copy (deep clone) of the current object
	Copy() Copyable
}

type UniqueID interface {
	GetUID() string
}

type UniqueCopyable interface {
	Copyable
	UniqueID
}

type FIFO interface {
	cache.Store

	// Pop waits until an item is ready and returns it. If multiple items are
	// ready, they are returned in the order in which they were added/updated.
	// The item is removed from the queue (and the store) before it is returned,
	// so if you don't successfully process it, you need to add it back with Add().
	Pop() interface{}

	// Await attempts to Pop within the given interval; upon success the non-nil
	// item is returned, otherwise nil
	Await(timeout time.Duration) interface{}

	// Poll returns true if there is an entry for the id that matches the event mask.
	Poll(id string, types EventType) bool
}

type Delayed interface {
	// return the remaining delay; a non-positive value indicates no delay
	GetDelay() time.Duration
}

type Deadlined interface {
	// when ok, returns the time when this object should be activated/executed/evaluated
	Deadline() (deadline time.Time, ok bool)
}

// No objects are ever expected to be sent over this channel. References to BreakChan
// instances may be nil (always blocking). Signalling over this channel is performed by
// closing the channel. As such there can only ever be a single signal sent over the
// lifetime of the channel.
type BreakChan <-chan struct{}

// an optional interface to be implemented by Delayed objects; returning a nil
// channel from Breaker() results in waiting the full delay duration
type Breakout interface {
	// return a channel that signals early departure from a blocking delay
	Breaker() BreakChan
}

type UniqueDelayed interface {
	UniqueID
	Delayed
}

type UniqueDeadlined interface {
	UniqueID
	Deadlined
}
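A short sketch of the OR'd event-mask semantics defined by this interface, using only the exported API; the import path, ids, and timeout are illustrative assumptions:

package demo

import (
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue" // assumed import path
)

// drainOrSkip illustrates the event-mask semantics: Poll reports whether the
// entry for id currently matches ANY of the masked event types, and Await is
// a Pop with a deadline (a nil result means the timeout elapsed).
func drainOrSkip(f queue.FIFO, id string) (interface{}, bool) {
	if f.Poll(id, queue.DELETE_EVENT|queue.POP_EVENT) {
		return nil, false // id is a tombstone: already popped or deleted
	}
	if v := f.Await(100 * time.Millisecond); v != nil {
		return v, true
	}
	return nil, false // nothing became ready within the timeout
}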
70
contrib/mesos/pkg/queue/policy.go
Normal file
@@ -0,0 +1,70 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queue

// Decide whether a pre-existing deadline for an item in a delay-queue should be
// updated if an attempt is made to offer/add a new deadline for said item. Whether
// the deadline changes or not has zero impact on the data blob associated with the
// entry in the queue.
type DeadlinePolicy int

const (
	PreferLatest DeadlinePolicy = iota
	PreferEarliest
)

// Decide whether a pre-existing data blob in a delay-queue should be replaced if
// an attempt is made to add/offer a new data blob in its place. Whether the data is
// replaced has no bearing on the deadline (priority) of the item in the queue.
type ReplacementPolicy int

const (
	KeepExisting ReplacementPolicy = iota
	ReplaceExisting
)

func (rp ReplacementPolicy) replacementValue(original, replacement interface{}) (result interface{}) {
	switch rp {
	case KeepExisting:
		result = original
	case ReplaceExisting:
		fallthrough
	default:
		result = replacement
	}
	return
}

func (dp DeadlinePolicy) nextDeadline(a, b Priority) (result Priority) {
	switch dp {
	case PreferEarliest:
		if a.ts.Before(b.ts) {
			result = a
		} else {
			result = b
		}
	case PreferLatest:
		fallthrough
	default:
		if a.ts.After(b.ts) {
			result = a
		} else {
			result = b
		}
	}
	return
}
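A sketch of the two policy axes, written as an in-package test since both helpers are unexported; the test name and values are illustrative, not part of this commit:

package queue

import (
	"testing"
	"time"
)

// TestPolicySketch demonstrates that the two policies are independent:
// ReplacementPolicy governs only the data blob, DeadlinePolicy only the deadline.
func TestPolicySketch(t *testing.T) {
	if v := KeepExisting.replacementValue("old", "new"); v != "old" {
		t.Fatalf("KeepExisting should keep the original, got %v", v)
	}
	if v := ReplaceExisting.replacementValue("old", "new"); v != "new" {
		t.Fatalf("ReplaceExisting should take the offered value, got %v", v)
	}

	early := Priority{ts: time.Now()}
	late := Priority{ts: early.ts.Add(time.Minute)}
	if d := PreferEarliest.nextDeadline(early, late); !d.Equal(early) {
		t.Fatal("PreferEarliest should pick the earlier deadline")
	}
	if d := PreferLatest.nextDeadline(early, late); !d.Equal(late) {
		t.Fatal("PreferLatest should pick the later deadline")
	}
}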
56
contrib/mesos/pkg/queue/priority.go
Normal file
@@ -0,0 +1,56 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queue

import (
	"time"
)

type Priority struct {
	ts     time.Time // timestamp
	notify BreakChan // notification channel
}

func (p Priority) Equal(other Priority) bool {
	return p.ts.Equal(other.ts) && p.notify == other.notify
}

func extractFromDelayed(d Delayed) Priority {
	deadline := time.Now().Add(d.GetDelay())
	breaker := BreakChan(nil)
	if breakout, good := d.(Breakout); good {
		breaker = breakout.Breaker()
	}
	return Priority{
		ts:     deadline,
		notify: breaker,
	}
}

func extractFromDeadlined(d Deadlined) (Priority, bool) {
	if ts, ok := d.Deadline(); ok {
		breaker := BreakChan(nil)
		if breakout, good := d.(Breakout); good {
			breaker = breakout.Breaker()
		}
		return Priority{
			ts:     ts,
			notify: breaker,
		}, true
	}
	return Priority{}, false
}
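A sketch of how a queued item would opt into early departure: implement Delayed for the deadline and Breakout for the escape hatch. The delayedTask type is a hypothetical example, not part of this commit:

package queue

import "time"

// delayedTask is an illustrative Delayed item that also implements Breakout,
// so a blocked delay can be released early by closing abort.
type delayedTask struct {
	deadline time.Time
	abort    chan struct{} // close to break out of the delay early
}

func (d *delayedTask) GetDelay() time.Duration { return d.deadline.Sub(time.Now()) }
func (d *delayedTask) Breaker() BreakChan      { return BreakChan(d.abort) }

// compile-time checks: extractFromDelayed derives a Priority whose ts is
// now+GetDelay() and whose notify member is the Breaker channel.
var _ Delayed = &delayedTask{}
var _ Breakout = &delayedTask{}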
19
contrib/mesos/pkg/redirfd/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Some file descriptor manipulation funcs (Unix-Only), inspired by
// https://github.com/skarnet/execline/blob/master/src/execline/redirfd.c
package redirfd
41
contrib/mesos/pkg/redirfd/file_descriptor.go
Normal file
@@ -0,0 +1,41 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package redirfd

import (
	"fmt"
	"strconv"
)

// FileDescriptor mirrors unix-specific indexes for cross-platform use
type FileDescriptor int

const (
	InvalidFD FileDescriptor = -1
	Stdin     FileDescriptor = 0
	Stdout    FileDescriptor = 1
	Stderr    FileDescriptor = 2
)

// ParseFileDescriptor parses a string formatted file descriptor
func ParseFileDescriptor(fdstr string) (FileDescriptor, error) {
	fdint, err := strconv.Atoi(fdstr)
	if err != nil {
		return InvalidFD, fmt.Errorf("file descriptor must be an integer: %q", fdstr)
	}
	return FileDescriptor(fdint), nil
}
54
contrib/mesos/pkg/redirfd/file_descriptor_test.go
Normal file
@@ -0,0 +1,54 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package redirfd

import (
	"testing"

	. "github.com/onsi/gomega"
)

func TestParseFileDescriptor(t *testing.T) {
	RegisterTestingT(t)

	valid := map[string]FileDescriptor{
		"-1": InvalidFD,
		"0":  Stdin,
		"1":  Stdout,
		"2":  Stderr,
		"3":  FileDescriptor(3),
	}

	for input, expected := range valid {
		fd, err := ParseFileDescriptor(input)
		Expect(err).ToNot(HaveOccurred(), "Input: '%s'", input)
		Expect(fd).To(Equal(expected), "Input: '%s'", input)
	}

	invalid := []string{
		"a",
		" 1",
		"blue",
		"stderr",
		"STDERR",
	}

	for _, input := range invalid {
		_, err := ParseFileDescriptor(input)
		Expect(err).To(HaveOccurred(), "Input: '%s'", input)
	}
}
208
contrib/mesos/pkg/redirfd/redirfd_unix.go
Normal file
@@ -0,0 +1,208 @@
// +build !windows

/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package redirfd

import (
	"fmt"
	"os"
	"syscall"
)

type RedirectMode int

const (
	Read           RedirectMode = iota // open file for reading
	Write                              // open file for writing, truncating if it exists
	Update                             // open file for read & write
	Append                             // open file for append, create if it does not exist
	AppendExisting                     // open file for append, do not create if it does not already exist
	WriteNew                           // open file for writing, creating it, failing if it already exists
)

// see https://github.com/skarnet/execline/blob/master/src/execline/redirfd.c
func (mode RedirectMode) Redirect(nonblock, changemode bool, fd FileDescriptor, name string) (*os.File, error) {
	flags := 0
	what := -1

	switch mode {
	case Read:
		what = syscall.O_RDONLY
		flags &= ^(syscall.O_APPEND | syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
	case Write:
		what = syscall.O_WRONLY
		flags |= syscall.O_CREAT | syscall.O_TRUNC
		flags &= ^(syscall.O_APPEND | syscall.O_EXCL)
	case Update:
		what = syscall.O_RDWR
		flags &= ^(syscall.O_APPEND | syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
	case Append:
		what = syscall.O_WRONLY
		flags |= syscall.O_CREAT | syscall.O_APPEND
		flags &= ^(syscall.O_TRUNC | syscall.O_EXCL)
	case AppendExisting:
		what = syscall.O_WRONLY
		flags |= syscall.O_APPEND
		flags &= ^(syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
	case WriteNew:
		what = syscall.O_WRONLY
		flags |= syscall.O_CREAT | syscall.O_EXCL
		flags &= ^(syscall.O_APPEND | syscall.O_TRUNC)
	default:
		return nil, fmt.Errorf("unexpected mode %d", mode)
	}
	if nonblock {
		flags |= syscall.O_NONBLOCK
	}
	flags |= what

	fd2, e := open(name, flags, 0666)
	if (what == syscall.O_WRONLY) && (e == syscall.ENXIO) {
		// ENXIO on a write-only open typically means a fifo with no reader:
		// open it once in read-only, non-blocking mode so that the write-side
		// open can succeed, then close that temporary read descriptor.
		fdr, e2 := open(name, syscall.O_RDONLY|syscall.O_NONBLOCK, 0)
		if e2 != nil {
			return nil, &os.PathError{"open_read", name, e2}
		}
		fd2, e = open(name, flags, 0666)
		fd_close(fdr)
	}
	if e != nil {
		return nil, &os.PathError{"open", name, e}
	}
	if e = fd_move(fd, fd2); e != nil {
		return nil, &os.PathError{"fd_move", name, e}
	}
	if changemode {
		if nonblock {
			e = ndelay_off(fd)
		} else {
			e = ndelay_on(fd)
		}
		if e != nil {
			return nil, &os.PathError{"ndelay", name, e}
		}
	}
	// fd2 has been moved onto fd (and closed), so fd is the descriptor that
	// remains open
	return os.NewFile(uintptr(fd), name), nil
}

// proxy to return a FileDescriptor
func open(path string, openmode int, perm uint32) (FileDescriptor, error) {
	fdint, err := syscall.Open(path, openmode, perm)
	return FileDescriptor(fdint), err
}

// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/fd_move.c
func fd_move(to, from FileDescriptor) (err error) {
	if to == from {
		return
	}
	for {
		_, _, e1 := syscall.RawSyscall(syscall.SYS_DUP2, uintptr(from), uintptr(to), 0)
		if e1 != syscall.EINTR {
			if e1 != 0 {
				err = e1
			}
			break
		}
	}
	if err == nil {
		// dup2 succeeded: close the source descriptor, mirroring the C original
		err = fd_close(from)
	}
	return
	/*
		do
			r = dup2(from, to) ;
		while ((r == -1) && (errno == EINTR)) ;
		return (r == -1) ? -1 : fd_close(from) ;
	*/
}

// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/fd_close.c
func fd_close(fd FileDescriptor) (err error) {
	i := 0
	var e error
	for {
		if e = syscall.Close(int(fd)); e == nil {
			// close succeeded
			return nil
		}
		i++
		if e != syscall.EINTR {
			break
		}
	}
	if e == syscall.EBADF && i > 1 {
		// an earlier EINTR retry may have already closed the descriptor
		return nil
	}
	return e
}

/*
	int fd_close (int fd)
	{
		register unsigned int i = 0 ;
		doit:
		if (!close(fd)) return 0 ;
		i++ ;
		if (errno == EINTR) goto doit ;
		return ((errno == EBADF) && (i > 1)) ? 0 : -1 ;
	}
*/

// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/ndelay_on.c
func ndelay_on(fd FileDescriptor) error {
	// 32-bit will likely break because it needs SYS_FCNTL64
	got, _, e := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_GETFL), 0)
	if e != 0 {
		return e
	}
	_, _, e = syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_SETFL), uintptr(got|syscall.O_NONBLOCK))
	if e != 0 {
		return e
	}
	return nil
}

/*
	int ndelay_on (int fd)
	{
		register int got = fcntl(fd, F_GETFL) ;
		return (got == -1) ? -1 : fcntl(fd, F_SETFL, got | O_NONBLOCK) ;
	}
*/

// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/ndelay_off.c
func ndelay_off(fd FileDescriptor) error {
	// 32-bit will likely break because it needs SYS_FCNTL64
	got, _, e := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_GETFL), 0)
	if e != 0 {
		return e
	}
	_, _, e = syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_SETFL), uintptr(int(got) & ^syscall.O_NONBLOCK))
	if e != 0 {
		return e
	}
	return nil
}

/*
	int ndelay_off (int fd)
	{
		register int got = fcntl(fd, F_GETFL) ;
		return (got == -1) ? -1 : fcntl(fd, F_SETFL, got & ^O_NONBLOCK) ;
	}
*/
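A minimal usage sketch of Redirect on unix, assuming the package is importable at the path shown; the log path is illustrative, and this is roughly what `redirfd -a 2 /tmp/demo.log` does in execline terms:

// +build !windows

package main

import (
	"fmt"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd" // assumed import path
)

func main() {
	// Append this process's stderr (fd 2) onto a log file, creating the file
	// if needed; nonblock and changemode are both left off.
	f, err := redirfd.Append.Redirect(false, false, redirfd.Stderr, "/tmp/demo.log")
	if err != nil {
		fmt.Println("redirect failed:", err)
		return
	}
	// fd 2 now refers to the log file; writes through the returned *os.File
	// (or anything else writing to stderr) land in /tmp/demo.log.
	fmt.Fprintln(f, "hello from the redirected stderr")
}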
39
contrib/mesos/pkg/redirfd/redirfd_windows.go
Normal file
@@ -0,0 +1,39 @@
// +build windows

/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package redirfd

import (
	"fmt"
	"os"
)

type RedirectMode int

const (
	Read           RedirectMode = iota // open file for reading
	Write                              // open file for writing, truncating if it exists
	Update                             // open file for read & write
	Append                             // open file for append, create if it does not exist
	AppendExisting                     // open file for append, do not create if it does not already exist
	WriteNew                           // open file for writing, creating it, failing if it already exists
)

func (mode RedirectMode) Redirect(nonblock, changemode bool, fd FileDescriptor, name string) (*os.File, error) {
	return nil, fmt.Errorf("Redirect(%v, %v, %d, %q) not supported on windows", nonblock, changemode, fd, name)
}
19
contrib/mesos/pkg/runtime/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package runtime provides utilities for semaphores (chan struct{}),
// a simple Latch implementation, and metrics for reporting handled panics.
package runtime
35
contrib/mesos/pkg/runtime/latch.go
Normal file
@@ -0,0 +1,35 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
	"sync/atomic"
)

type Latch struct {
	int32
}

// return true if this latch was successfully acquired. concurrency safe. will only return true
// upon the first invocation, all subsequent invocations will return false. always returns false
// when self is nil.
func (self *Latch) Acquire() bool {
	if self == nil {
		return false
	}
	return atomic.CompareAndSwapInt32(&self.int32, 0, 1)
}
61
contrib/mesos/pkg/runtime/latch_test.go
Normal file
@@ -0,0 +1,61 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
	"sync"
	"sync/atomic"
	"testing"
	"time"
)

func Test_LatchAcquireBasic(t *testing.T) {
	var x Latch
	if !x.Acquire() {
		t.Fatalf("expected first acquire to succeed")
	}
	if x.Acquire() {
		t.Fatalf("expected second acquire to fail")
	}
	if x.Acquire() {
		t.Fatalf("expected third acquire to fail")
	}
}

func Test_LatchAcquireConcurrent(t *testing.T) {
	var x Latch
	const NUM = 10
	ch := make(chan struct{})
	var success int32
	var wg sync.WaitGroup
	wg.Add(NUM)
	for i := 0; i < NUM; i++ {
		go func() {
			defer wg.Done()
			<-ch
			if x.Acquire() {
				atomic.AddInt32(&success, 1)
			}
		}()
	}
	time.Sleep(200 * time.Millisecond)
	close(ch)
	wg.Wait()
	if success != 1 {
		t.Fatalf("expected single acquire to succeed instead of %d", success)
	}
}
47
contrib/mesos/pkg/runtime/metrics.go
Normal file
@@ -0,0 +1,47 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
	"sync"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
	"github.com/prometheus/client_golang/prometheus"
)

const (
	runtimeSubsystem = "runtime"
)

var (
	panicCounter = prometheus.NewCounter(
		prometheus.CounterOpts{
			Subsystem: runtimeSubsystem,
			Name:      "panics",
			Help:      "Counter of panics handled by the internal crash handler.",
		},
	)
)

var registerMetrics sync.Once

func Register() {
	registerMetrics.Do(func() {
		prometheus.MustRegister(panicCounter)
		util.PanicHandlers = append(util.PanicHandlers, func(interface{}) { panicCounter.Inc() })
	})
}
122
contrib/mesos/pkg/runtime/util.go
Normal file
@@ -0,0 +1,122 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
	"os"
	"sync"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
)

type Signal <-chan struct{}

// return a func that will close the signal chan.
// multiple invocations of the returned func will not generate a panic.
// two funcs from separate invocations of Closer() (on the same sig chan) will cause a panic if both invoked.
// for example:
//   // good
//   sig := make(chan struct{})
//   f := runtime.Closer(sig)
//   f()
//   f() // no panic: the close happens only once
//
//   // bad
//   sig := make(chan struct{})
//   f := runtime.Closer(sig)
//   g := runtime.Closer(sig)
//   f()
//   g() // this will panic: second close of the same chan
func Closer(sig chan<- struct{}) func() {
	var once sync.Once
	return func() {
		once.Do(func() { close(sig) })
	}
}

// upon receiving signal sig invoke function f and immediately return a signal
// that indicates f's completion. used to chain handler funcs, for example:
//   On(job.Done(), response.Send).Then(wg.Done)
func (sig Signal) Then(f func()) Signal {
	if sig == nil {
		return nil
	}
	return On(sig, f)
}

// execute a callback function after the specified signal chan closes.
// immediately returns a signal that indicates f's completion.
func On(sig <-chan struct{}, f func()) Signal {
	if sig == nil {
		return nil
	}
	return After(func() {
		<-sig
		if f != nil {
			f()
		}
	})
}

func OnOSSignal(sig <-chan os.Signal, f func(os.Signal)) Signal {
	if sig == nil {
		return nil
	}
	return After(func() {
		if s, ok := <-sig; ok && f != nil {
			f(s)
		}
	})
}

// spawn a goroutine to execute a func, immediately returning a chan that
// closes upon completion of the func. a nil func is treated as a no-op:
// the returned chan still closes right away.
func After(f func()) Signal {
	ch := make(chan struct{})
	go func() {
		defer close(ch)
		defer util.HandleCrash()
		if f != nil {
			f()
		}
	}()
	return Signal(ch)
}

// periodically execute the given function, stopping once stopCh is closed.
// this func blocks until stopCh is closed, it's intended to be run as a goroutine.
func Until(f func(), period time.Duration, stopCh <-chan struct{}) {
	if f == nil {
		return
	}
	for {
		select {
		case <-stopCh:
			return
		default:
		}
		func() {
			defer util.HandleCrash()
			f()
		}()
		select {
		case <-stopCh:
		case <-time.After(period):
		}
	}
}
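A small sketch of chaining these completion signals together; the import path is an assumption and the printed strings are illustrative:

package main

import (
	"fmt"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime" // assumed import path
)

func main() {
	// After runs f in a goroutine and yields a Signal that closes when f
	// returns; Then chains a second func onto that completion.
	done := runtime.After(func() { fmt.Println("working...") }).Then(func() { fmt.Println("cleaning up") })
	<-done

	// Closer wraps a chan so that repeated shutdown requests are safe.
	stop := make(chan struct{})
	shutdown := runtime.Closer(stop)
	finished := runtime.On(stop, func() { fmt.Println("stopped") })
	shutdown()
	shutdown() // idempotent: the chan is only closed once
	<-finished
}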
64
contrib/mesos/pkg/runtime/util_test.go
Normal file
@@ -0,0 +1,64 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
	"testing"
	"time"
)

func TestUntil(t *testing.T) {
	ch := make(chan struct{})
	close(ch)
	Until(func() {
		t.Fatal("should not have been invoked")
	}, 0, ch)

	//--
	ch = make(chan struct{})
	called := make(chan struct{})
	After(func() {
		Until(func() {
			called <- struct{}{}
		}, 0, ch)
	}).Then(func() { close(called) })

	<-called
	close(ch)
	<-called

	//--
	ch = make(chan struct{})
	called = make(chan struct{})
	running := make(chan struct{})
	After(func() {
		Until(func() {
			close(running)
			called <- struct{}{}
		}, 2*time.Second, ch)
	}).Then(func() { close(called) })

	<-running
	close(ch)
	<-called // unblock the goroutine
	now := time.Now()

	<-called
	if time.Since(now) > 1800*time.Millisecond {
		t.Fatalf("Until should not have waited the full timeout period since we closed the stop chan")
	}
}
109
contrib/mesos/pkg/scheduler/config/config.go
Normal file
@@ -0,0 +1,109 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package config

import (
	"io"
	"time"

	"code.google.com/p/gcfg"
)

const (
	DefaultOfferTTL                           = 5 * time.Second   // duration an offer is viable, prior to being expired
	DefaultOfferLingerTTL                     = 120 * time.Second // duration an expired offer lingers in history
	DefaultListenerDelay                      = 1 * time.Second   // duration between offer listener notifications
	DefaultUpdatesBacklog                     = 2048              // size of the pod updates channel
	DefaultFrameworkIdRefreshInterval         = 30 * time.Second  // interval we update the frameworkId stored in etcd
	DefaultInitialImplicitReconciliationDelay = 15 * time.Second  // wait this amount of time after initial registration before attempting implicit reconciliation
	DefaultExplicitReconciliationMaxBackoff   = 2 * time.Minute   // interval in between internal task status checks/updates
	DefaultExplicitReconciliationAbortTimeout = 30 * time.Second  // waiting period after attempting to cancel an ongoing reconciliation
	DefaultInitialPodBackoff                  = 1 * time.Second
	DefaultMaxPodBackoff                      = 60 * time.Second
	DefaultHttpHandlerTimeout                 = 10 * time.Second
	DefaultHttpBindInterval                   = 5 * time.Second
)

// Example scheduler configuration file:
//
// [scheduler]
// info-name = Kubernetes
// offer-ttl = 5s
// offer-linger-ttl = 2m

type ConfigWrapper struct {
	Scheduler Config
}

type Config struct {
	OfferTTL                           WrappedDuration `gcfg:"offer-ttl"`
	OfferLingerTTL                     WrappedDuration `gcfg:"offer-linger-ttl"`
	ListenerDelay                      WrappedDuration `gcfg:"listener-delay"`
	UpdatesBacklog                     int             `gcfg:"updates-backlog"`
	FrameworkIdRefreshInterval         WrappedDuration `gcfg:"framework-id-refresh-interval"`
	InitialImplicitReconciliationDelay WrappedDuration `gcfg:"initial-implicit-reconciliation-delay"`
	ExplicitReconciliationMaxBackoff   WrappedDuration `gcfg:"explicit-reconciliantion-max-backoff"`
	ExplicitReconciliationAbortTimeout WrappedDuration `gcfg:"explicit-reconciliantion-abort-timeout"`
	InitialPodBackoff                  WrappedDuration `gcfg:"initial-pod-backoff"`
	MaxPodBackoff                      WrappedDuration `gcfg:"max-pod-backoff"`
	HttpHandlerTimeout                 WrappedDuration `gcfg:"http-handler-timeout"`
	HttpBindInterval                   WrappedDuration `gcfg:"http-bind-interval"`
}

type WrappedDuration struct {
	time.Duration
}

func (wd *WrappedDuration) UnmarshalText(data []byte) error {
	d, err := time.ParseDuration(string(data))
	if err == nil {
		wd.Duration = d
	}
	return err
}

func (c *Config) SetDefaults() {
	c.OfferTTL = WrappedDuration{DefaultOfferTTL}
	c.OfferLingerTTL = WrappedDuration{DefaultOfferLingerTTL}
	c.ListenerDelay = WrappedDuration{DefaultListenerDelay}
	c.UpdatesBacklog = DefaultUpdatesBacklog
	c.FrameworkIdRefreshInterval = WrappedDuration{DefaultFrameworkIdRefreshInterval}
	c.InitialImplicitReconciliationDelay = WrappedDuration{DefaultInitialImplicitReconciliationDelay}
	c.ExplicitReconciliationMaxBackoff = WrappedDuration{DefaultExplicitReconciliationMaxBackoff}
	c.ExplicitReconciliationAbortTimeout = WrappedDuration{DefaultExplicitReconciliationAbortTimeout}
	c.InitialPodBackoff = WrappedDuration{DefaultInitialPodBackoff}
	c.MaxPodBackoff = WrappedDuration{DefaultMaxPodBackoff}
	c.HttpHandlerTimeout = WrappedDuration{DefaultHttpHandlerTimeout}
	c.HttpBindInterval = WrappedDuration{DefaultHttpBindInterval}
}

func CreateDefaultConfig() *Config {
	c := &Config{}
	c.SetDefaults()
	return c
}

func (c *Config) Read(configReader io.Reader) error {
	wrapper := &ConfigWrapper{Scheduler: *c}
	if configReader != nil {
		if err := gcfg.ReadInto(wrapper, configReader); err != nil {
			return err
		}
		*c = wrapper.Scheduler
	}
	return nil
}
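A sketch of the intended load pattern, using only the exported API above: start from compiled-in defaults and overlay whatever the gcfg file provides. The import path and the config file path are illustrative assumptions:

package main

import (
	"fmt"
	"os"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config" // assumed import path
)

func main() {
	c := config.CreateDefaultConfig()
	if f, err := os.Open("/etc/k8sm/scheduler.conf"); err == nil {
		defer f.Close()
		// Read parses the [scheduler] section and overwrites only the keys
		// present in the file; everything else keeps its default.
		if err := c.Read(f); err != nil {
			fmt.Println("bad scheduler config:", err)
			os.Exit(1)
		}
	}
	fmt.Println("offer TTL:", c.OfferTTL.Duration)
}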
112
contrib/mesos/pkg/scheduler/config/config_test.go
Normal file
@@ -0,0 +1,112 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package config

import (
	"strings"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
)

func is_default(c *Config, t *testing.T) {
	assert := assert.New(t)

	assert.Equal(DefaultOfferTTL, c.OfferTTL.Duration)
	assert.Equal(DefaultOfferLingerTTL, c.OfferLingerTTL.Duration)
	assert.Equal(DefaultListenerDelay, c.ListenerDelay.Duration)
	assert.Equal(DefaultUpdatesBacklog, c.UpdatesBacklog)
	assert.Equal(DefaultFrameworkIdRefreshInterval, c.FrameworkIdRefreshInterval.Duration)
	assert.Equal(DefaultInitialImplicitReconciliationDelay, c.InitialImplicitReconciliationDelay.Duration)
	assert.Equal(DefaultExplicitReconciliationMaxBackoff, c.ExplicitReconciliationMaxBackoff.Duration)
	assert.Equal(DefaultExplicitReconciliationAbortTimeout, c.ExplicitReconciliationAbortTimeout.Duration)
	assert.Equal(DefaultInitialPodBackoff, c.InitialPodBackoff.Duration)
	assert.Equal(DefaultMaxPodBackoff, c.MaxPodBackoff.Duration)
	assert.Equal(DefaultHttpHandlerTimeout, c.HttpHandlerTimeout.Duration)
	assert.Equal(DefaultHttpBindInterval, c.HttpBindInterval.Duration)
}

// Check that SetDefaults sets the default values
func TestConfig_SetDefaults(t *testing.T) {
	c := &Config{}
	c.SetDefaults()
	is_default(c, t)
}

// Check that CreateDefaultConfig returns a default config
func TestConfig_CreateDefaultConfig(t *testing.T) {
	c := CreateDefaultConfig()
	is_default(c, t)
}

// Check that a config string can be parsed
func TestConfig_Read(t *testing.T) {
	assert := assert.New(t)

	c := CreateDefaultConfig()
	reader := strings.NewReader(`
[scheduler]
offer-ttl=42s
offer-linger-ttl=42s
listener-delay=42s
updates-backlog=42
framework-id-refresh-interval=42s
initial-implicit-reconciliation-delay=42s
explicit-reconciliantion-max-backoff=42s
explicit-reconciliantion-abort-timeout=42s
initial-pod-backoff=42s
max-pod-backoff=42s
http-handler-timeout=42s
http-bind-interval=42s
`)
	err := c.Read(reader)
	if err != nil {
		t.Fatal("Cannot parse scheduler config: " + err.Error())
	}

	assert.Equal(42*time.Second, c.OfferTTL.Duration)
	assert.Equal(42*time.Second, c.OfferLingerTTL.Duration)
	assert.Equal(42*time.Second, c.ListenerDelay.Duration)
	assert.Equal(42, c.UpdatesBacklog)
	assert.Equal(42*time.Second, c.FrameworkIdRefreshInterval.Duration)
	assert.Equal(42*time.Second, c.InitialImplicitReconciliationDelay.Duration)
	assert.Equal(42*time.Second, c.ExplicitReconciliationMaxBackoff.Duration)
	assert.Equal(42*time.Second, c.ExplicitReconciliationAbortTimeout.Duration)
	assert.Equal(42*time.Second, c.InitialPodBackoff.Duration)
	assert.Equal(42*time.Second, c.MaxPodBackoff.Duration)
	assert.Equal(42*time.Second, c.HttpHandlerTimeout.Duration)
	assert.Equal(42*time.Second, c.HttpBindInterval.Duration)
}

// check that an invalid config is rejected and none of the values are overwritten
func TestConfig_ReadError(t *testing.T) {
	assert := assert.New(t)

	c := CreateDefaultConfig()
	reader := strings.NewReader(`
[scheduler]
offer-ttl = 42s
invalid-setting = 42s
`)
	err := c.Read(reader)
	if err == nil {
		t.Fatal("Invalid scheduler config should lead to an error")
	}

	assert.NotEqual(42*time.Second, c.OfferTTL.Duration)
}
18
contrib/mesos/pkg/scheduler/config/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package config provides mechanisms for low-level scheduler tuning.
package config
106
contrib/mesos/pkg/scheduler/constraint/constraint.go
Normal file
@@ -0,0 +1,106 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package constraint

import (
	"encoding/json"
	"fmt"
)

type OperatorType int

const (
	UniqueOperator OperatorType = iota
	LikeOperator
	ClusterOperator
	GroupByOperator
	UnlikeOperator
)

var (
	labels = []string{
		"UNIQUE",
		"LIKE",
		"CLUSTER",
		"GROUP_BY",
		"UNLIKE",
	}

	labelToType map[string]OperatorType
)

func init() {
	labelToType = make(map[string]OperatorType)
	for i, s := range labels {
		labelToType[s] = OperatorType(i)
	}
}

func (t OperatorType) String() string {
	switch t {
	case UniqueOperator, LikeOperator, ClusterOperator, GroupByOperator, UnlikeOperator:
		return labels[int(t)]
	default:
		panic(fmt.Sprintf("unrecognized operator type: %d", int(t)))
	}
}

func parseOperatorType(s string) (OperatorType, error) {
	t, found := labelToType[s]
	if !found {
		return UniqueOperator, fmt.Errorf("unrecognized operator %q", s)
	}
	return t, nil
}

type Constraint struct {
	Field    string       // required
	Operator OperatorType // required
	Value    string       // optional
}

func (c *Constraint) MarshalJSON() ([]byte, error) {
	var a []string
	if c != nil {
		if c.Value != "" {
			a = append(a, c.Field, c.Operator.String(), c.Value)
		} else {
			a = append(a, c.Field, c.Operator.String())
		}
	}
	return json.Marshal(a)
}

func (c *Constraint) UnmarshalJSON(buf []byte) (err error) {
	var a []string
	if err = json.Unmarshal(buf, &a); err != nil {
		return err
	}
	switch x := len(a); {
	case x < 2:
		err = fmt.Errorf("not enough arguments to form constraint")
	case x > 3:
		err = fmt.Errorf("too many arguments to form constraint")
	case x == 3:
		c.Value = a[2]
		fallthrough
	case x == 2:
		c.Field = a[0]
		c.Operator, err = parseOperatorType(a[1])
	}
	return err
}
79
contrib/mesos/pkg/scheduler/constraint/constraint_test.go
Normal file
79
contrib/mesos/pkg/scheduler/constraint/constraint_test.go
Normal file
@@ -0,0 +1,79 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package constraint

import (
	"encoding/json"
	"testing"
)

func TestDeserialize(t *testing.T) {
	shouldMatch := func(js string, field string, operator OperatorType, value string) (err error) {
		constraint := Constraint{}
		if err = json.Unmarshal(([]byte)(js), &constraint); err != nil {
			return
		}
		if field != constraint.Field {
			t.Fatalf("expected field %q instead of %q", field, constraint.Field)
		}
		if operator != constraint.Operator {
			t.Fatalf("expected operator %v instead of %v", operator, constraint.Operator)
		}
		if value != constraint.Value {
			t.Fatalf("expected value %q instead of %q", value, constraint.Value)
		}
		return
	}
	failOnError := func(err error) {
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
	}
	failOnError(shouldMatch(`["hostname","UNIQUE"]`, "hostname", UniqueOperator, ""))
	failOnError(shouldMatch(`["rackid","GROUP_BY","1"]`, "rackid", GroupByOperator, "1"))
	failOnError(shouldMatch(`["jdk","LIKE","7"]`, "jdk", LikeOperator, "7"))
	failOnError(shouldMatch(`["jdk","UNLIKE","7"]`, "jdk", UnlikeOperator, "7"))
	failOnError(shouldMatch(`["bob","CLUSTER","foo"]`, "bob", ClusterOperator, "foo"))
	err := shouldMatch(`["bill","NOT_REALLY_AN_OPERATOR","pete"]`, "bill", ClusterOperator, "pete")
	if err == nil {
		t.Fatalf("expected unmarshalling error for invalid operator")
	}
}

func TestSerialize(t *testing.T) {
	shouldMatch := func(expected string, constraint *Constraint) error {
		data, err := json.Marshal(constraint)
		if err != nil {
			return err
		}
		js := string(data)
		if js != expected {
			t.Fatalf("expected json %q instead of %q", expected, js)
		}
		return nil
	}
	failOnError := func(err error) {
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
	}
	failOnError(shouldMatch(`["hostname","UNIQUE"]`, &Constraint{"hostname", UniqueOperator, ""}))
	failOnError(shouldMatch(`["rackid","GROUP_BY","1"]`, &Constraint{"rackid", GroupByOperator, "1"}))
	failOnError(shouldMatch(`["jdk","LIKE","7"]`, &Constraint{"jdk", LikeOperator, "7"}))
	failOnError(shouldMatch(`["jdk","UNLIKE","7"]`, &Constraint{"jdk", UnlikeOperator, "7"}))
	failOnError(shouldMatch(`["bob","CLUSTER","foo"]`, &Constraint{"bob", ClusterOperator, "foo"}))
}
19
contrib/mesos/pkg/scheduler/constraint/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package constraint exposes Marathon-like constraints for scheduling pods.
// Incomplete.
package constraint
18
contrib/mesos/pkg/scheduler/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package scheduler implements the Kubernetes Mesos scheduler.
package scheduler
57
contrib/mesos/pkg/scheduler/fcfs.go
Normal file
@@ -0,0 +1,57 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"fmt"

	log "github.com/golang/glog"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
)

// FCFSScheduleFunc is a first-come-first-served scheduler: it acquires the
// first offer that can support the task.
func FCFSScheduleFunc(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
	podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name)
	var acceptedOffer offers.Perishable
	err := r.Walk(func(p offers.Perishable) (bool, error) {
		offer := p.Details()
		if offer == nil {
			return false, fmt.Errorf("nil offer while scheduling task %v", task.ID)
		}
		if task.AcceptOffer(offer) {
			if p.Acquire() {
				acceptedOffer = p
				log.V(3).Infof("Pod %s accepted offer %v", podName, offer.Id.GetValue())
				return true, nil // stop, we found an offer
			}
		}
		return false, nil // continue
	})
	if acceptedOffer != nil {
		if err != nil {
			log.Warningf("problems walking the offer registry: %v, attempting to continue", err)
		}
		return acceptedOffer, nil
	}
	if err != nil {
		log.V(2).Infof("failed to find a fit for pod: %s, err = %v", podName, err)
		return nil, err
	}
	log.V(2).Infof("failed to find a fit for pod: %s", podName)
	return nil, noSuitableOffersErr
}
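The walk-and-acquire flow above is easiest to see in isolation. The following is a minimal sketch, not the project's offers.Registry: the toyOffer type and firstFit helper are hypothetical stand-ins that show the same first-fit contract, namely stop on the first offer that both fits and can be acquired.

package main

import "fmt"

// toyOffer is a hypothetical stand-in for offers.Perishable.
type toyOffer struct {
	id       string
	cpu      float64
	acquired bool
}

// acquire emulates Perishable.Acquire: it fails if the offer was
// already claimed, and succeeds exactly once otherwise.
func (o *toyOffer) acquire() bool {
	if o.acquired {
		return false
	}
	o.acquired = true
	return true
}

// firstFit mirrors the shape of FCFSScheduleFunc: walk offers in order and
// take the first one that fits the requested cpu and can be acquired.
func firstFit(offers []*toyOffer, wantCPU float64) (*toyOffer, error) {
	for _, o := range offers {
		if o.cpu >= wantCPU && o.acquire() {
			return o, nil // stop, we found an offer
		}
	}
	return nil, fmt.Errorf("no suitable offers for cpu=%v", wantCPU)
}

func main() {
	offers := []*toyOffer{{id: "o1", cpu: 0.5}, {id: "o2", cpu: 2}}
	o, err := firstFit(offers, 1)
	fmt.Println(o.id, err) // picks o2, the first offer large enough
}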
18
contrib/mesos/pkg/scheduler/ha/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package ha encapsulates high-availability scheduler concerns.
package ha
73
contrib/mesos/pkg/scheduler/ha/election.go
Normal file
@@ -0,0 +1,73 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ha

import (
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election"
	log "github.com/golang/glog"
)

type roleType int

const (
	followerRole roleType = iota
	masterRole
	retiredRole
)

type candidateService struct {
	sched     *SchedulerProcess
	newDriver DriverFactory
	role      roleType
	valid     ValidationFunc
}

type ValidationFunc func(desiredUid, currentUid string)

func NewCandidate(s *SchedulerProcess, f DriverFactory, v ValidationFunc) election.Service {
	return &candidateService{
		sched:     s,
		newDriver: f,
		role:      followerRole,
		valid:     v,
	}
}

func (self *candidateService) Validate(desired, current election.Master) {
	if self.valid != nil {
		self.valid(string(desired), string(current))
	}
}

func (self *candidateService) Start() {
	if self.role == followerRole {
		log.Info("elected as master")
		self.role = masterRole
		self.sched.Elect(self.newDriver)
	}
}

func (self *candidateService) Stop() {
	if self.role == masterRole {
		log.Info("retiring from master")
		self.role = retiredRole
		// order is important here, watchers of a SchedulerProcess will
		// check SchedulerProcess.Failover() once Done() is closed.
		close(self.sched.failover)
		self.sched.End()
	}
}
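To make the role transitions concrete, here is a hedged, self-contained sketch of the call sequence an election driver is expected to issue against a Service like candidateService: Start() when the candidate wins, Stop() when it loses. The Service interface and toyCandidate below are stand-ins, not the actual pkg/election API.

package main

import "fmt"

// Service is a stand-in mirroring the Start/Stop shape used above.
type Service interface {
	Start() // called when this candidate wins the election
	Stop()  // called when mastership is lost; the candidate retires
}

type toyCandidate struct{ role string }

func (c *toyCandidate) Start() {
	if c.role == "follower" {
		c.role = "master"
		fmt.Println("elected as master")
	}
}

func (c *toyCandidate) Stop() {
	if c.role == "master" {
		c.role = "retired" // one-shot: a retired candidate never runs again
		fmt.Println("retiring from master")
	}
}

func main() {
	var s Service = &toyCandidate{role: "follower"}
	s.Start() // win the election
	s.Stop()  // lose mastership; the process would then fail over
}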
285
contrib/mesos/pkg/scheduler/ha/ha.go
Normal file
@@ -0,0 +1,285 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ha

import (
	"fmt"
	"sync/atomic"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
	log "github.com/golang/glog"
	mesos "github.com/mesos/mesos-go/mesosproto"
	bindings "github.com/mesos/mesos-go/scheduler"
)

type DriverFactory func() (bindings.SchedulerDriver, error)

type stageType int32

const (
	initStage stageType = iota
	standbyStage
	masterStage
	finStage
)

func (stage *stageType) transition(from, to stageType) bool {
	return atomic.CompareAndSwapInt32((*int32)(stage), int32(from), int32(to))
}

func (s *stageType) transitionTo(to stageType, unless ...stageType) bool {
	if len(unless) == 0 {
		atomic.StoreInt32((*int32)(s), int32(to))
		return true
	}
	for {
		state := s.get()
		for _, x := range unless {
			if state == x {
				return false
			}
		}
		if s.transition(state, to) {
			return true
		}
	}
}

func (stage *stageType) get() stageType {
	return stageType(atomic.LoadInt32((*int32)(stage)))
}
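The atomic stage machine above is a small pattern worth isolating: a typed int32 whose transitions are only valid from a known prior stage. A minimal sketch using the same CompareAndSwap technique; the stage names here are illustrative, not the package's exported API.

package main

import (
	"fmt"
	"sync/atomic"
)

type stage int32

const (
	initS stage = iota
	standbyS
	masterS
)

// transition succeeds only if the stage currently equals from,
// so concurrent racers cannot both win the same transition.
func (s *stage) transition(from, to stage) bool {
	return atomic.CompareAndSwapInt32((*int32)(s), int32(from), int32(to))
}

// get loads the current stage without racing concurrent transitions.
func (s *stage) get() stage {
	return stage(atomic.LoadInt32((*int32)(s)))
}

func main() {
	var s stage = initS
	fmt.Println(s.transition(initS, standbyS))   // true: init -> standby
	fmt.Println(s.transition(initS, masterS))    // false: no longer in init
	fmt.Println(s.transition(standbyS, masterS)) // true: standby -> master
	fmt.Println(s.get() == masterS)              // true
}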
// execute some action in the deferred context of the process, but only if we
// match the stage of the process at the time the action is executed.
func (stage stageType) Do(p *SchedulerProcess, a proc.Action) <-chan error {
	errOnce := proc.NewErrorOnce(p.fin)
	errOuter := p.Do(proc.Action(func() {
		switch stage {
		case standbyStage:
			// await standby signal or death
			select {
			case <-p.standby:
			case <-p.Done():
			}
		case masterStage:
			// await elected signal or death
			select {
			case <-p.elected:
			case <-p.Done():
			}
		case finStage:
			errOnce.Reportf("scheduler process is dying, dropping action")
			return
		default:
		}
		errOnce.Report(stage.When(p, a))
	}))
	return errOnce.Send(errOuter).Err()
}

// execute some action only if we match the stage of the scheduler process
func (stage stageType) When(p *SchedulerProcess, a proc.Action) (err error) {
	if stage != (&p.stage).get() {
		err = fmt.Errorf("failed to execute deferred action, expected lifecycle stage %v instead of %v", stage, p.stage)
	} else {
		a()
	}
	return
}

type SchedulerProcess struct {
	proc.Process
	bindings.Scheduler
	stage    stageType
	elected  chan struct{} // upon close we've been elected
	failover chan struct{} // closed indicates that we should failover upon End()
	standby  chan struct{}
	fin      chan struct{}
}

func New(sched bindings.Scheduler) *SchedulerProcess {
	p := &SchedulerProcess{
		Process:   proc.New(),
		Scheduler: sched,
		stage:     initStage,
		elected:   make(chan struct{}),
		failover:  make(chan struct{}),
		standby:   make(chan struct{}),
		fin:       make(chan struct{}),
	}
	runtime.On(p.Running(), p.begin)
	return p
}

func (self *SchedulerProcess) begin() {
	if (&self.stage).transition(initStage, standbyStage) {
		close(self.standby)
		log.Infoln("scheduler process entered standby stage")
	} else {
		log.Errorf("failed to transition from init to standby stage")
	}
}

func (self *SchedulerProcess) End() <-chan struct{} {
	if (&self.stage).transitionTo(finStage, finStage) {
		defer close(self.fin)
		log.Infoln("scheduler process entered fin stage")
	}
	return self.Process.End()
}

func (self *SchedulerProcess) Elect(newDriver DriverFactory) {
	errOnce := proc.NewErrorOnce(self.fin)
	proc.OnError(errOnce.Send(standbyStage.Do(self, proc.Action(func() {
		if !(&self.stage).transition(standbyStage, masterStage) {
			log.Errorf("failed to transition from standby to master stage, aborting")
			self.End()
			return
		}
		log.Infoln("scheduler process entered master stage")
		drv, err := newDriver()
		if err != nil {
			log.Errorf("failed to fetch scheduler driver: %v", err)
			self.End()
			return
		}
		log.V(1).Infoln("starting driver...")
		stat, err := drv.Start()
		if stat == mesos.Status_DRIVER_RUNNING && err == nil {
			log.Infoln("driver started successfully and is running")
			close(self.elected)
			go func() {
				defer self.End()
				_, err := drv.Join()
				if err != nil {
					log.Errorf("driver failed with error: %v", err)
				}
				errOnce.Report(err)
			}()
			return
		}
		defer self.End()
		if err != nil {
			log.Errorf("failed to start scheduler driver: %v", err)
		} else {
			log.Errorf("expected RUNNING status, not %v", stat)
		}
	}))).Err(), func(err error) {
		defer self.End()
		log.Errorf("failed to handle election event, aborting: %v", err)
	}, self.fin)
}

func (self *SchedulerProcess) Terminal() <-chan struct{} {
	return self.fin
}

func (self *SchedulerProcess) Elected() <-chan struct{} {
	return self.elected
}

func (self *SchedulerProcess) Failover() <-chan struct{} {
	return self.failover
}

type masterProcess struct {
	*SchedulerProcess
	doer proc.Doer
}

func (self *masterProcess) Done() <-chan struct{} {
	return self.SchedulerProcess.Terminal()
}

func (self *masterProcess) Do(a proc.Action) <-chan error {
	return self.doer.Do(a)
}

// returns a Process instance that will only execute a proc.Action if the scheduler is the elected master
func (self *SchedulerProcess) Master() proc.Process {
	return &masterProcess{
		SchedulerProcess: self,
		doer: proc.DoWith(self, proc.DoerFunc(func(a proc.Action) <-chan error {
			return proc.ErrorChan(masterStage.When(self, a))
		})),
	}
}

func (self *SchedulerProcess) logError(ch <-chan error) {
	self.OnError(ch, func(err error) {
		log.Errorf("failed to execute scheduler action: %v", err)
	})
}

func (self *SchedulerProcess) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.Registered(drv, fid, mi)
	})))
}

func (self *SchedulerProcess) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.Reregistered(drv, mi)
	})))
}

func (self *SchedulerProcess) Disconnected(drv bindings.SchedulerDriver) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.Disconnected(drv)
	})))
}

func (self *SchedulerProcess) ResourceOffers(drv bindings.SchedulerDriver, off []*mesos.Offer) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.ResourceOffers(drv, off)
	})))
}

func (self *SchedulerProcess) OfferRescinded(drv bindings.SchedulerDriver, oid *mesos.OfferID) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.OfferRescinded(drv, oid)
	})))
}

func (self *SchedulerProcess) StatusUpdate(drv bindings.SchedulerDriver, ts *mesos.TaskStatus) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.StatusUpdate(drv, ts)
	})))
}

func (self *SchedulerProcess) FrameworkMessage(drv bindings.SchedulerDriver, eid *mesos.ExecutorID, sid *mesos.SlaveID, m string) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.FrameworkMessage(drv, eid, sid, m)
	})))
}

func (self *SchedulerProcess) SlaveLost(drv bindings.SchedulerDriver, sid *mesos.SlaveID) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.SlaveLost(drv, sid)
	})))
}

func (self *SchedulerProcess) ExecutorLost(drv bindings.SchedulerDriver, eid *mesos.ExecutorID, sid *mesos.SlaveID, x int) {
	self.logError(self.Master().Do(proc.Action(func() {
		self.Scheduler.ExecutorLost(drv, eid, sid, x)
	})))
}

func (self *SchedulerProcess) Error(drv bindings.SchedulerDriver, msg string) {
	self.Scheduler.Error(drv, msg)
}
30
contrib/mesos/pkg/scheduler/meta/annotations.go
Normal file
@@ -0,0 +1,30 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package meta

// kubernetes api object annotations
const (
	BindingHostKey           = "k8s.mesosphere.io/bindingHost"
	TaskIdKey                = "k8s.mesosphere.io/taskId"
	SlaveIdKey               = "k8s.mesosphere.io/slaveId"
	OfferIdKey               = "k8s.mesosphere.io/offerId"
	ExecutorIdKey            = "k8s.mesosphere.io/executorId"
	PortMappingKeyPrefix     = "k8s.mesosphere.io/port_"
	PortMappingKeyFormat     = PortMappingKeyPrefix + "%s_%d"
	PortNameMappingKeyPrefix = "k8s.mesosphere.io/portName_"
	PortNameMappingKeyFormat = PortNameMappingKeyPrefix + "%s_%s"
)
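The two *Format constants are consumed with fmt.Sprintf; the scheduler plugin does this when binding a pod (see prepareTaskForLaunch later in this commit). A short sketch of the resulting keys, with the constants mirrored locally and the protocol/port values purely illustrative:

package main

import "fmt"

const (
	portMappingKeyFormat     = "k8s.mesosphere.io/port_%s_%d"      // protocol, container port
	portNameMappingKeyFormat = "k8s.mesosphere.io/portName_%s_%s"  // protocol, port name
)

func main() {
	// e.g. a TCP container port 8080 with a named port "web"
	fmt.Println(fmt.Sprintf(portMappingKeyFormat, "TCP", 8080))      // k8s.mesosphere.io/port_TCP_8080
	fmt.Println(fmt.Sprintf(portNameMappingKeyFormat, "TCP", "web")) // k8s.mesosphere.io/portName_TCP_web
}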
19
contrib/mesos/pkg/scheduler/meta/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package meta defines framework constants used as keys in k8s annotations
// that are attached to k8s pods.
package meta
24
contrib/mesos/pkg/scheduler/meta/store.go
Normal file
@@ -0,0 +1,24 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package meta

// keys for things that we store
const (
	//TODO(jdef) this should also be a format instead of a fixed path
	FrameworkIDKey        = "/mesos/k8sm/frameworkid"
	DefaultElectionFormat = "/mesos/k8sm/framework/%s/leader"
)
18
contrib/mesos/pkg/scheduler/metrics/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package metrics defines and exposes instrumentation metrics of the scheduler.
package metrics
102
contrib/mesos/pkg/scheduler/metrics/metrics.go
Normal file
@@ -0,0 +1,102 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

const (
	schedulerSubsystem = "k8sm_scheduler"
)

var (
	QueueWaitTime = prometheus.NewSummary(
		prometheus.SummaryOpts{
			Subsystem: schedulerSubsystem,
			Name:      "queue_wait_time_microseconds",
			Help:      "Launch queue wait time in microseconds",
		},
	)
	BindLatency = prometheus.NewSummary(
		prometheus.SummaryOpts{
			Subsystem: schedulerSubsystem,
			Name:      "bind_latency_microseconds",
			Help:      "Latency in microseconds between pod-task launch and pod binding.",
		},
	)
	StatusUpdates = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: schedulerSubsystem,
			Name:      "status_updates",
			Help:      "Counter of TaskStatus updates, broken out by source, reason, state.",
		},
		[]string{"source", "reason", "state"},
	)
	ReconciliationLatency = prometheus.NewSummary(
		prometheus.SummaryOpts{
			Subsystem: schedulerSubsystem,
			Name:      "reconciliation_latency_microseconds",
			Help:      "Latency in microseconds to execute explicit task reconciliation.",
		},
	)
	ReconciliationRequested = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: schedulerSubsystem,
			Name:      "reconciliation_requested",
			Help:      "Counter of requested task reconciliations, broken out by kind.",
		},
		[]string{"kind"},
	)
	ReconciliationExecuted = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: schedulerSubsystem,
			Name:      "reconciliation_executed",
			Help:      "Counter of executed task reconciliation requests, broken out by kind.",
		},
		[]string{"kind"},
	)
	ReconciliationCancelled = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: schedulerSubsystem,
			Name:      "reconciliation_cancelled",
			Help:      "Counter of cancelled task reconciliation requests, broken out by kind.",
		},
		[]string{"kind"},
	)
)

var registerMetrics sync.Once

func Register() {
	registerMetrics.Do(func() {
		prometheus.MustRegister(QueueWaitTime)
		prometheus.MustRegister(BindLatency)
		prometheus.MustRegister(StatusUpdates)
		prometheus.MustRegister(ReconciliationLatency)
		prometheus.MustRegister(ReconciliationRequested)
		prometheus.MustRegister(ReconciliationExecuted)
		prometheus.MustRegister(ReconciliationCancelled)
	})
}

func InMicroseconds(d time.Duration) float64 {
	return float64(d.Nanoseconds() / time.Microsecond.Nanoseconds())
}
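A minimal sketch of the intended call pattern for these collectors, assuming this metrics package is importable at the path used elsewhere in the commit; the label values are illustrative, not a fixed vocabulary:

package main

import (
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
)

func main() {
	metrics.Register() // safe to call more than once: guarded by sync.Once

	// time a queue wait and record it in microseconds
	start := time.Now()
	time.Sleep(5 * time.Millisecond) // stand-in for real work
	metrics.QueueWaitTime.Observe(metrics.InMicroseconds(time.Since(start)))

	// count a status update, broken out by the declared label dimensions
	metrics.StatusUpdates.WithLabelValues("master", "reconciliation", "TASK_LOST").Inc()
}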
203
contrib/mesos/pkg/scheduler/mock_test.go
Normal file
@@ -0,0 +1,203 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"sync"
	"testing"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	mesos "github.com/mesos/mesos-go/mesosproto"
	"github.com/stretchr/testify/mock"
)

// implements SchedulerInterface
type MockScheduler struct {
	sync.RWMutex
	mock.Mock
}

func (m *MockScheduler) slaveFor(id string) (slave *Slave, ok bool) {
	args := m.Called(id)
	x := args.Get(0)
	if x != nil {
		slave = x.(*Slave)
	}
	ok = args.Bool(1)
	return
}
func (m *MockScheduler) algorithm() (f PodScheduleFunc) {
	args := m.Called()
	x := args.Get(0)
	if x != nil {
		f = x.(PodScheduleFunc)
	}
	return
}
func (m *MockScheduler) createPodTask(ctx api.Context, pod *api.Pod) (task *podtask.T, err error) {
	args := m.Called(ctx, pod)
	x := args.Get(0)
	if x != nil {
		task = x.(*podtask.T)
	}
	err = args.Error(1)
	return
}
func (m *MockScheduler) offers() (f offers.Registry) {
	args := m.Called()
	x := args.Get(0)
	if x != nil {
		f = x.(offers.Registry)
	}
	return
}
func (m *MockScheduler) tasks() (f podtask.Registry) {
	args := m.Called()
	x := args.Get(0)
	if x != nil {
		f = x.(podtask.Registry)
	}
	return
}
func (m *MockScheduler) killTask(taskId string) error {
	args := m.Called(taskId)
	return args.Error(0)
}
func (m *MockScheduler) launchTask(task *podtask.T) error {
	args := m.Called(task)
	return args.Error(0)
}

// @deprecated this is a placeholder for me to test the mock package
func TestNoSlavesYet(t *testing.T) {
	obj := &MockScheduler{}
	obj.On("slaveFor", "foo").Return(nil, false)
	obj.slaveFor("foo")
	obj.AssertExpectations(t)
}

/*-----------------------------------------------------------------------------
|
| this really belongs in the mesos-go package, but that's being updated soon
| anyway so just keep it here for now unless we *really* need it there.
|
\-----------------------------------------------------------------------------

// Scheduler defines the interfaces that need to be implemented.
type Scheduler interface {
	Registered(SchedulerDriver, *FrameworkID, *MasterInfo)
	Reregistered(SchedulerDriver, *MasterInfo)
	Disconnected(SchedulerDriver)
	ResourceOffers(SchedulerDriver, []*Offer)
	OfferRescinded(SchedulerDriver, *OfferID)
	StatusUpdate(SchedulerDriver, *TaskStatus)
	FrameworkMessage(SchedulerDriver, *ExecutorID, *SlaveID, string)
	SlaveLost(SchedulerDriver, *SlaveID)
	ExecutorLost(SchedulerDriver, *ExecutorID, *SlaveID, int)
	Error(SchedulerDriver, string)
}
*/

func status(args mock.Arguments, at int) (val mesos.Status) {
	if x := args.Get(at); x != nil {
		val = x.(mesos.Status)
	}
	return
}

type extendedMock struct {
	mock.Mock
}

// Upon returns a chan that closes upon the execution of the most recently registered call.
func (m *extendedMock) Upon() <-chan struct{} {
	ch := make(chan struct{})
	call := &m.ExpectedCalls[len(m.ExpectedCalls)-1]
	f := call.Run
	call.Run = func(args mock.Arguments) {
		defer close(ch)
		if f != nil {
			f(args)
		}
	}
	return ch
}

type MockSchedulerDriver struct {
	extendedMock
}

func (m *MockSchedulerDriver) Init() error {
	args := m.Called()
	return args.Error(0)
}
func (m *MockSchedulerDriver) Start() (mesos.Status, error) {
	args := m.Called()
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Stop(b bool) (mesos.Status, error) {
	args := m.Called(b)
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Abort() (mesos.Status, error) {
	args := m.Called()
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Join() (mesos.Status, error) {
	args := m.Called()
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Run() (mesos.Status, error) {
	args := m.Called()
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) RequestResources(r []*mesos.Request) (mesos.Status, error) {
	args := m.Called(r)
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) ReconcileTasks(statuses []*mesos.TaskStatus) (mesos.Status, error) {
	args := m.Called(statuses)
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) LaunchTasks(offerIds []*mesos.OfferID, ti []*mesos.TaskInfo, f *mesos.Filters) (mesos.Status, error) {
	args := m.Called(offerIds, ti, f)
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) KillTask(tid *mesos.TaskID) (mesos.Status, error) {
	args := m.Called(tid)
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) DeclineOffer(oid *mesos.OfferID, f *mesos.Filters) (mesos.Status, error) {
	args := m.Called(oid, f)
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) ReviveOffers() (mesos.Status, error) {
	args := m.Called()
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) SendFrameworkMessage(eid *mesos.ExecutorID, sid *mesos.SlaveID, s string) (mesos.Status, error) {
	args := m.Called(eid, sid, s)
	return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Destroy() {
	m.Called()
}
func (m *MockSchedulerDriver) Wait() {
	m.Called()
}
875
contrib/mesos/pkg/scheduler/plugin.go
Normal file
@@ -0,0 +1,875 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"fmt"
	"io"
	"net/http"
	"strconv"
	"sync"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/backoff"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
	annotation "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/record"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
	plugin "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler"
	"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
	log "github.com/golang/glog"
	mesos "github.com/mesos/mesos-go/mesosproto"
	mutil "github.com/mesos/mesos-go/mesosutil"
)

const (
	enqueuePopTimeout   = 200 * time.Millisecond
	enqueueWaitTimeout  = 1 * time.Second
	yieldPopTimeout     = 200 * time.Millisecond
	yieldWaitTimeout    = 1 * time.Second
	pluginRecoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling
)

// scheduler abstraction to allow for easier unit testing
type schedulerInterface interface {
	sync.Locker // synchronize scheduler plugin operations
	SlaveIndex
	algorithm() PodScheduleFunc
	offers() offers.Registry
	tasks() podtask.Registry

	// driver calls

	killTask(taskId string) error
	launchTask(*podtask.T) error

	// convenience

	createPodTask(api.Context, *api.Pod) (*podtask.T, error)
}

type k8smScheduler struct {
	sync.Mutex
	internal *KubernetesScheduler
}

func (k *k8smScheduler) algorithm() PodScheduleFunc {
	return k.internal.scheduleFunc
}

func (k *k8smScheduler) offers() offers.Registry {
	return k.internal.offers
}

func (k *k8smScheduler) tasks() podtask.Registry {
	return k.internal.taskRegistry
}

func (k *k8smScheduler) createPodTask(ctx api.Context, pod *api.Pod) (*podtask.T, error) {
	return podtask.New(ctx, "", *pod, k.internal.executor)
}

func (k *k8smScheduler) slaveFor(id string) (slave *Slave, ok bool) {
	slave, ok = k.internal.slaves.getSlave(id)
	return
}

func (k *k8smScheduler) killTask(taskId string) error {
	killTaskId := mutil.NewTaskID(taskId)
	_, err := k.internal.driver.KillTask(killTaskId)
	return err
}

func (k *k8smScheduler) launchTask(task *podtask.T) error {
	// assume caller is holding scheduler lock
	taskList := []*mesos.TaskInfo{task.BuildTaskInfo()}
	offerIds := []*mesos.OfferID{task.Offer.Details().Id}
	filters := &mesos.Filters{}
	_, err := k.internal.driver.LaunchTasks(offerIds, taskList, filters)
	return err
}

type binder struct {
	api schedulerInterface
}

// Bind implements binding.Registry; it launches the pod-associated task in mesos.
func (b *binder) Bind(binding *api.Binding) error {
	ctx := api.WithNamespace(api.NewContext(), binding.Namespace)

	// default upstream scheduler passes pod.Name as binding.Name
	podKey, err := podtask.MakePodKey(ctx, binding.Name)
	if err != nil {
		return err
	}

	b.api.Lock()
	defer b.api.Unlock()

	switch task, state := b.api.tasks().ForPod(podKey); state {
	case podtask.StatePending:
		return b.bind(ctx, binding, task)
	default:
		// in this case it's likely that the pod has been deleted between Schedule
		// and Bind calls
		log.Infof("No pending task for pod %s", podKey)
		return noSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
	}
}

func (b *binder) rollback(task *podtask.T, err error) error {
	task.Offer.Release()
	task.Reset()
	if err2 := b.api.tasks().Update(task); err2 != nil {
		log.Errorf("failed to update pod task: %v", err2)
	}
	return err
}

// assumes that: caller has acquired scheduler lock and that the task is still pending
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
	// sanity check: ensure that the task HasAcceptedOffer(); it's possible that between
	// Schedule() and now the offer for this task was rescinded or invalidated
	// ((we should never see this here))
	if !task.HasAcceptedOffer() {
		return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
	}

	// By this time, there is a chance that the slave is disconnected.
	offerId := task.GetOfferId()
	if offer, ok := b.api.offers().Get(offerId); !ok || offer.HasExpired() {
		// already rescinded or timed out or otherwise invalidated
		return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
	}

	if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
		log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\"",
			task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name)
		if err = b.api.launchTask(task); err == nil {
			b.api.offers().Invalidate(offerId)
			task.Set(podtask.Launched)
			if err = b.api.tasks().Update(task); err != nil {
				// this should only happen if the task has been removed or has changed status,
				// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
				log.Errorf("failed to update task w/ Launched status: %v", err)
			}
			return
		}
	}
	return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err))
}

//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
	pod := task.Pod

	// we make an effort here to avoid making changes to the task's copy of the pod, since
	// we want that to reflect the initial user spec, and not the modified spec that we
	// build for the executor to consume.
	oemCt := pod.Spec.Containers
	pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod

	if pod.Annotations == nil {
		pod.Annotations = make(map[string]string)
	} else {
		oemAnn := pod.Annotations
		pod.Annotations = make(map[string]string)
		for k, v := range oemAnn {
			pod.Annotations[k] = v
		}
	}
	pod.Annotations[annotation.BindingHostKey] = machine
	task.SaveRecoveryInfo(pod.Annotations)

	for _, entry := range task.Spec.PortMap {
		oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
		ports := append([]api.ContainerPort{}, oemPorts...)
		p := &ports[entry.PortIdx]
		p.HostPort = int(entry.OfferPort)
		op := strconv.FormatUint(entry.OfferPort, 10)
		pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
		if p.Name != "" {
			pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
		}
		pod.Spec.Containers[entry.ContainerIdx].Ports = ports
	}

	// the kubelet-executor uses this to instantiate the pod
	log.V(3).Infof("prepared pod spec: %+v", pod)

	data, err := api.Codec.Encode(&pod)
	if err != nil {
		log.V(2).Infof("Failed to marshal the pod spec: %v", err)
		return err
	}
	task.Spec.Data = data
	return nil
}
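prepareTaskForLaunch goes out of its way to clone the containers slice (and the annotations map) before mutating them, so that the task's stored pod keeps the user's original spec. The aliasing hazard it avoids is easy to demonstrate in isolation; a minimal sketch with a hypothetical container type:

package main

import "fmt"

type container struct{ hostPort int }

func main() {
	original := []container{{hostPort: 0}}

	// WRONG: slice assignment aliases the backing array, so
	// mutating the "copy" also mutates the original
	aliased := original
	aliased[0].hostPort = 31000
	fmt.Println(original[0].hostPort) // 31000: original corrupted

	original[0].hostPort = 0

	// RIGHT: shallow clone first, as prepareTaskForLaunch does
	cloned := append([]container{}, original...)
	cloned[0].hostPort = 31000
	fmt.Println(original[0].hostPort) // 0: original preserved
}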
type kubeScheduler struct {
	api        schedulerInterface
	podUpdates queue.FIFO
}

// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selectedMachine's name and error (if there's any).
func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.MinionLister) (string, error) {
	log.Infof("Try to schedule pod %v\n", pod.Name)
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)

	// default upstream scheduler passes pod.Name as binding.PodID
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		return "", err
	}

	k.api.Lock()
	defer k.api.Unlock()

	switch task, state := k.api.tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		// There's a bit of a potential race here: a pod could have been yielded() and
		// then deleted before we get *here*.
		// We use meta to index the pod in the store since that's what the k8s reflector does.
		podName, err := cache.MetaNamespaceKeyFunc(pod)
		if err != nil {
			log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
			return "", noSuchPodErr
		}
		if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
			// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
			log.Infof("aborting Schedule, pod has been deleted %+v", pod)
			return "", noSuchPodErr
		}
		return k.doSchedule(k.api.tasks().Register(k.api.createPodTask(ctx, pod)))

	//TODO(jdef) it's possible that the pod state has diverged from what
	//we knew previously, we should probably update the task.Pod state here
	//before proceeding with scheduling
	case podtask.StatePending:
		if pod.UID != task.Pod.UID {
			// we're dealing with a brand new pod spec here, so the old one must have been
			// deleted -- and so our task store is out of sync w/ respect to reality
			//TODO(jdef) reconcile task
			return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
		} else if task.Has(podtask.Launched) {
			// task has been marked as "launched" but the pod binding creation may have failed in k8s,
			// but we're going to let someone else handle it, probably the mesos task error handler
			return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
		} else {
			return k.doSchedule(task, nil)
		}

	default:
		return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
	}
}

// doSchedule calls the configured ScheduleFunc and reserves resources, returning
// the name of the machine the task is scheduled on.
func (k *kubeScheduler) doSchedule(task *podtask.T, err error) (string, error) {
	var offer offers.Perishable
	if task.HasAcceptedOffer() {
		// verify that the offer is still on the table
		offerId := task.GetOfferId()
		if o, ok := k.api.offers().Get(offerId); ok && !o.HasExpired() {
			// skip tasks that already have assigned offers
			offer = task.Offer
		} else {
			task.Offer.Release()
			task.Reset()
			if err = k.api.tasks().Update(task); err != nil {
				return "", err
			}
		}
	}
	if err == nil && offer == nil {
		offer, err = k.api.algorithm()(k.api.offers(), k.api, task)
	}
	if err != nil {
		return "", err
	}
	details := offer.Details()
	if details == nil {
		return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
	}
	slaveId := details.GetSlaveId().GetValue()
	if slave, ok := k.api.slaveFor(slaveId); !ok {
		// not much sense in Release()ing the offer here since its owner died
		offer.Release()
		k.api.offers().Invalidate(details.Id.GetValue())
		return "", fmt.Errorf("Slave disappeared (%v) while scheduling task %v", slaveId, task.ID)
	} else {
		if task.Offer != nil && task.Offer != offer {
			return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
		}
		task.Offer = offer
		task.FillFromDetails(details)
		if err := k.api.tasks().Update(task); err != nil {
			offer.Release()
			return "", err
		}
		return slave.HostName, nil
	}
}

type queuer struct {
	lock            sync.Mutex       // shared by condition variables of this struct
	podUpdates      queue.FIFO       // queue of pod updates to be processed
	podQueue        *queue.DelayFIFO // queue of pods to be scheduled
	deltaCond       sync.Cond        // pod changes are available for processing
	unscheduledCond sync.Cond        // there are unscheduled pods for processing
}

func newQueuer(store queue.FIFO) *queuer {
	q := &queuer{
		podQueue:   queue.NewDelayFIFO(),
		podUpdates: store,
	}
	q.deltaCond.L = &q.lock
	q.unscheduledCond.L = &q.lock
	return q
}

func (q *queuer) installDebugHandlers(mux *http.ServeMux) {
	mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
		for _, x := range q.podQueue.List() {
			if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
				break
			}
		}
	})
	mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
		for _, x := range q.podUpdates.List() {
			if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
				break
			}
		}
	})
}

// signal that there are probably pod updates waiting to be processed
func (q *queuer) updatesAvailable() {
	q.deltaCond.Broadcast()
}

// delete a pod from the to-be-scheduled queue
func (q *queuer) dequeue(id string) {
	q.podQueue.Delete(id)
}

// re-add a pod to the to-be-scheduled queue; will not overwrite existing pod data (that
// may have already changed).
func (q *queuer) requeue(pod *Pod) {
	// use KeepExisting in case the pod has already been updated (can happen if binding fails
	// due to constraint violations); we don't want to overwrite a newer entry with stale data.
	q.podQueue.Add(pod, queue.KeepExisting)
	q.unscheduledCond.Broadcast()
}

// same as requeue but calls podQueue.Offer instead of podQueue.Add
func (q *queuer) reoffer(pod *Pod) {
	// use KeepExisting in case the pod has already been updated (can happen if binding fails
	// due to constraint violations); we don't want to overwrite a newer entry with stale data.
	if q.podQueue.Offer(pod, queue.KeepExisting) {
		q.unscheduledCond.Broadcast()
	}
}

// spawns a go-routine to watch for unscheduled pods and queue them up
// for scheduling. returns immediately.
func (q *queuer) Run(done <-chan struct{}) {
	go runtime.Until(func() {
		log.Info("Watching for newly created pods")
		q.lock.Lock()
		defer q.lock.Unlock()

		for {
			// limit blocking here for short intervals so that scheduling
			// may proceed even if there have been no recent pod changes
			p := q.podUpdates.Await(enqueuePopTimeout)
			if p == nil {
				signalled := runtime.After(q.deltaCond.Wait)
				// we've yielded the lock
				select {
				case <-time.After(enqueueWaitTimeout):
					q.deltaCond.Broadcast() // abort Wait()
					<-signalled             // wait for lock re-acquisition
					log.V(4).Infoln("timed out waiting for a pod update")
				case <-signalled:
					// we've acquired the lock and there may be
					// changes for us to process now
				}
				continue
			}

			pod := p.(*Pod)
			if pod.Spec.NodeName != "" {
				log.V(3).Infof("dequeuing pod for scheduling: %v", pod.Pod.Name)
				q.dequeue(pod.GetUID())
			} else {
				// use ReplaceExisting because we are always pushing the latest state
				now := time.Now()
				pod.deadline = &now
				if q.podQueue.Offer(pod, queue.ReplaceExisting) {
					q.unscheduledCond.Broadcast()
					log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
				} else {
					log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
				}
			}
		}
	}, 1*time.Second, done)
}

// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
func (q *queuer) yield() *api.Pod {
	log.V(2).Info("attempting to yield a pod")
	q.lock.Lock()
	defer q.lock.Unlock()

	for {
		// limit blocking here to short intervals so that we don't block the
		// enqueuer Run() routine for very long
		kpod := q.podQueue.Await(yieldPopTimeout)
		if kpod == nil {
			signalled := runtime.After(q.unscheduledCond.Wait)
			// lock is yielded at this point and we're going to wait for either
			// a timeout, or a signal that there's data
			select {
			case <-time.After(yieldWaitTimeout):
				q.unscheduledCond.Broadcast() // abort Wait()
				<-signalled                   // wait for the go-routine, and the lock
				log.V(4).Infoln("timed out waiting for a pod to yield")
			case <-signalled:
				// we have acquired the lock, and there
				// may be a pod for us to pop now
			}
			continue
		}

		pod := kpod.(*Pod).Pod
		if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
			log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
		} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
			log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
		} else if pod.Spec.NodeName != "" {
			// should never happen if enqueuePods is filtering properly
			log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
		} else {
			return pod
		}
	}
}
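Both Run() and yield() use the same trick to bound a sync.Cond wait: a helper goroutine performs the Wait and a channel is closed on wakeup, while the caller selects between that channel and a timer, issuing a Broadcast to abort the Wait on timeout. A distilled, self-contained sketch, with the runtime.After helper approximated inline; this is an illustration of the pattern, not the project's runtime package:

package main

import (
	"fmt"
	"sync"
	"time"
)

// after runs f in a goroutine and closes the returned chan when f returns,
// approximating the runtime.After helper used above.
func after(f func()) <-chan struct{} {
	ch := make(chan struct{})
	go func() {
		defer close(ch)
		f()
	}()
	return ch
}

func main() {
	var mu sync.Mutex
	cond := sync.Cond{L: &mu}

	mu.Lock()
	defer mu.Unlock()

	// Wait releases mu while blocked and re-acquires it on wakeup; running
	// it in a helper goroutine lets us select against a timer here.
	signalled := after(cond.Wait)
	select {
	case <-time.After(100 * time.Millisecond):
		cond.Broadcast() // abort Wait()
		<-signalled      // wait for lock re-acquisition
		fmt.Println("timed out waiting for a signal")
	case <-signalled:
		fmt.Println("woken by a producer")
	}
}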
||||
|
||||
type errorHandler struct {
    api     schedulerInterface
    backoff *backoff.Backoff
    qr      *queuer
}

// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {
    if schedulingErr == noSuchPodErr {
        log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
        return
    }

    log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
    defer util.HandleCrash()

    // default upstream scheduler passes pod.Name as binding.PodID
    ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
    podKey, err := podtask.MakePodKey(ctx, pod.Name)
    if err != nil {
        log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
        return
    }

    k.backoff.GC()
    k.api.Lock()
    defer k.api.Unlock()

    switch task, state := k.api.tasks().ForPod(podKey); state {
    case podtask.StateUnknown:
        // if we don't have a mapping here any more then someone deleted the pod
        log.V(2).Infof("Could not resolve pod to task, aborting pod reschedule: %s", podKey)
        return

    case podtask.StatePending:
        if task.Has(podtask.Launched) {
            log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
            return
        }
        breakoutEarly := queue.BreakChan(nil)
        if schedulingErr == noSuitableOffersErr {
            log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
            breakoutEarly = queue.BreakChan(k.api.offers().Listen(podKey, func(offer *mesos.Offer) bool {
                k.api.Lock()
                defer k.api.Unlock()
                switch task, state := k.api.tasks().Get(task.ID); state {
                case podtask.StatePending:
                    return !task.Has(podtask.Launched) && task.AcceptOffer(offer)
                default:
                    // no point in continuing to check for matching offers
                    return true
                }
            }))
        }
        delay := k.backoff.Get(podKey)
        log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
        k.qr.requeue(&Pod{Pod: pod, delay: &delay, notify: breakoutEarly})

    default:
        log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
    }
}
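
// The breakout channel registered above lets a requeue delay end early as
// soon as a matching offer arrives. A minimal sketch of how such a delay can
// be consumed (hypothetical helper; assumes queue.BreakChan is usable as a
// receive-only channel, per its use with offers().Listen above):
func sleepOrBreak(delay time.Duration, breaker queue.BreakChan) {
    select {
    case <-time.After(delay):
        // backoff period elapsed without a matching offer
    case <-breaker:
        // a matching offer arrived; stop backing off early
    }
}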
type deleter struct {
    api schedulerInterface
    qr  *queuer
}

// currently monitors for "pod deleted" events, upon which handle()
// is invoked.
func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
    go runtime.Until(func() {
        for {
            entry := <-updates
            pod := entry.Value().(*Pod)
            if entry.Is(queue.DELETE_EVENT) {
                if err := k.deleteOne(pod); err != nil {
                    log.Error(err)
                }
            } else if !entry.Is(queue.POP_EVENT) {
                k.qr.updatesAvailable()
            }
        }
    }, 1*time.Second, done)
}

func (k *deleter) deleteOne(pod *Pod) error {
    ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
    podKey, err := podtask.MakePodKey(ctx, pod.Name)
    if err != nil {
        return err
    }

    log.V(2).Infof("pod deleted: %v", podKey)

    // order is important here: we want to make sure we have the lock before
    // removing the pod from the scheduling queue. this makes the concurrent
    // execution of scheduler-error-handling and delete-handling easier to
    // reason about.
    k.api.Lock()
    defer k.api.Unlock()

    // prevent the scheduler from attempting to pop this; it's also possible that
    // it's concurrently being scheduled (somewhere between pod scheduling and
    // binding) - if so, then we'll end up removing it from taskRegistry which
    // will abort Bind()ing
    k.qr.dequeue(pod.GetUID())

    switch task, state := k.api.tasks().ForPod(podKey); state {
    case podtask.StateUnknown:
        log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
        return noSuchPodErr

    // determine if the task has already been launched to mesos, if not then
    // cleanup is easier (unregister) since there's no state to sync
    case podtask.StatePending:
        if !task.Has(podtask.Launched) {
            // we've been invoked in between Schedule() and Bind()
            if task.HasAcceptedOffer() {
                task.Offer.Release()
                task.Reset()
                task.Set(podtask.Deleted)
                //TODO(jdef) probably want better handling here
                if err := k.api.tasks().Update(task); err != nil {
                    return err
                }
            }
            k.api.tasks().Unregister(task)
            return nil
        }
        fallthrough

    case podtask.StateRunning:
        // signal to watchers that the related pod is going down
        task.Set(podtask.Deleted)
        if err := k.api.tasks().Update(task); err != nil {
            log.Errorf("failed to update task w/ Deleted status: %v", err)
        }
        return k.api.killTask(task.ID)

    default:
        log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
        return noSuchTaskErr
    }
}

// NewDefaultPluginConfig creates a scheduler plugin config, along with all
// supporting background functions, watching all pods via the client.
func (k *KubernetesScheduler) NewDefaultPluginConfig(terminate <-chan struct{}, mux *http.ServeMux) *PluginConfig {
    // use ListWatch watching pods using the client by default
    return k.NewPluginConfig(terminate, mux, createAllPodsLW(k.client))
}

func (k *KubernetesScheduler) NewPluginConfig(terminate <-chan struct{}, mux *http.ServeMux,
    podsWatcher *cache.ListWatch) *PluginConfig {

    // Watch and queue pods that need scheduling.
    updates := make(chan queue.Entry, k.schedcfg.UpdatesBacklog)
    podUpdates := &podStoreAdapter{queue.NewHistorical(updates)}
    reflector := cache.NewReflector(podsWatcher, &api.Pod{}, podUpdates, 0)

    // lock that guards critical sections that involve transferring pods from
    // the store (cache) to the scheduling queue; its purpose is to maintain
    // an ordering (vs interleaving) of operations that's easier to reason about.
    kapi := &k8smScheduler{internal: k}
    q := newQueuer(podUpdates)
    podDeleter := &deleter{
        api: kapi,
        qr:  q,
    }
    eh := &errorHandler{
        api:     kapi,
        backoff: backoff.New(k.schedcfg.InitialPodBackoff.Duration, k.schedcfg.MaxPodBackoff.Duration),
        qr:      q,
    }
    startLatch := make(chan struct{})
    eventBroadcaster := record.NewBroadcaster()
    runtime.On(startLatch, func() {
        eventBroadcaster.StartRecordingToSink(k.client.Events(""))
        reflector.Run() // TODO(jdef) should listen for termination
        podDeleter.Run(updates, terminate)
        q.Run(terminate)

        q.installDebugHandlers(mux)
        podtask.InstallDebugHandlers(k.taskRegistry, mux)
    })
    return &PluginConfig{
        Config: &plugin.Config{
            MinionLister: nil,
            Algorithm: &kubeScheduler{
                api:        kapi,
                podUpdates: podUpdates,
            },
            Binder:   &binder{api: kapi},
            NextPod:  q.yield,
            Error:    eh.handleSchedulingError,
            Recorder: eventBroadcaster.NewRecorder(api.EventSource{Component: "scheduler"}),
        },
        api:      kapi,
        client:   k.client,
        qr:       q,
        deleter:  podDeleter,
        starting: startLatch,
    }
}

type PluginConfig struct {
    *plugin.Config
    api      schedulerInterface
    client   *client.Client
    qr       *queuer
    deleter  *deleter
    starting chan struct{} // startup latch
}

func NewPlugin(c *PluginConfig) PluginInterface {
    return &schedulingPlugin{
        config:   c.Config,
        api:      c.api,
        client:   c.client,
        qr:       c.qr,
        deleter:  c.deleter,
        starting: c.starting,
    }
}

type schedulingPlugin struct {
    config   *plugin.Config
    api      schedulerInterface
    client   *client.Client
    qr       *queuer
    deleter  *deleter
    starting chan struct{}
}

func (s *schedulingPlugin) Run(done <-chan struct{}) {
    defer close(s.starting)
    go runtime.Until(s.scheduleOne, pluginRecoveryDelay, done)
}

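// Typical wiring of the pieces above, sketched as a comment because the
// surrounding variables are assumptions: `k` is a *KubernetesScheduler,
// `terminate` a termination channel and `mux` an *http.ServeMux.
//
//    c := k.NewDefaultPluginConfig(terminate, mux)
//    p := NewPlugin(c)
//    p.Run(terminate) // closes c.starting once the scheduling loop is started
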
// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go,
// with the Modeler stuff removed since we don't use it because we have mesos.
func (s *schedulingPlugin) scheduleOne() {
    pod := s.config.NextPod()
    log.V(3).Infof("Attempting to schedule: %v", pod)
    dest, err := s.config.Algorithm.Schedule(pod, s.config.MinionLister) // call kubeScheduler.Schedule
    if err != nil {
        log.V(1).Infof("Failed to schedule: %v", pod)
        s.config.Recorder.Eventf(pod, "failedScheduling", "Error scheduling: %v", err)
        s.config.Error(pod, err)
        return
    }
    b := &api.Binding{
        ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
        Target: api.ObjectReference{
            Kind: "Node",
            Name: dest,
        },
    }
    if err := s.config.Binder.Bind(b); err != nil {
        log.V(1).Infof("Failed to bind pod: %v", err)
        s.config.Recorder.Eventf(pod, "failedScheduling", "Binding rejected: %v", err)
        s.config.Error(pod, err)
        return
    }
    s.config.Recorder.Eventf(pod, "scheduled", "Successfully assigned %v to %v", pod.Name, dest)
}

// this pod may be out of sync with respect to the API server registry:
//
//    this pod   | apiserver registry
//    -----------|----------------------
//    host=.*    | 404         ; pod was deleted
//    host=.*    | 5xx         ; failed to sync, try again later?
//    host=""    | host=""     ; perhaps no updates to process?
//    host=""    | host="..."  ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
//    host="..." | host=""     ; pod is no longer scheduled, does it need to be re-queued?
//    host="..." | host="..."  ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *schedulingPlugin) reconcilePod(oldPod api.Pod) {
    log.V(1).Infof("reconcile pod %v", oldPod.Name)
    ctx := api.WithNamespace(api.NewDefaultContext(), oldPod.Namespace)
    pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(oldPod.Name)
    if err != nil {
        if errors.IsNotFound(err) {
            // attempt to delete
            if err = s.deleter.deleteOne(&Pod{Pod: &oldPod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
                log.Errorf("failed to delete pod: %v: %v", oldPod.Name, err)
            }
        } else {
            //TODO(jdef) other errors should probably trigger a retry (w/ backoff).
            //For now, drop the pod on the floor
            log.Warningf("aborting reconciliation for pod %v: %v", oldPod.Name, err)
        }
        return
    }
    if oldPod.Spec.NodeName != pod.Spec.NodeName {
        if pod.Spec.NodeName == "" {
            // pod is unscheduled.
            // it's possible that we dropped the pod in the scheduler error handler
            // because of task misalignment with the pod (task.Has(podtask.Launched) == true)

            podKey, err := podtask.MakePodKey(ctx, pod.Name)
            if err != nil {
                log.Error(err)
                return
            }

            s.api.Lock()
            defer s.api.Unlock()

            if _, state := s.api.tasks().ForPod(podKey); state != podtask.StateUnknown {
                //TODO(jdef) reconcile the task
                log.Errorf("task already registered for pod %v", pod.Name)
                return
            }

            now := time.Now()
            log.V(3).Infof("reoffering pod %v", podKey)
            s.qr.reoffer(&Pod{
                Pod:      pod,
                deadline: &now,
            })
        } else {
            // pod is scheduled.
            // not sure how this happened behind our backs. attempt to reconstruct
            // at least a partial podtask.T record.
            //TODO(jdef) reconcile the task
            log.Errorf("pod already scheduled: %v", pod.Name)
        }
    } else {
        //TODO(jdef) for now, ignore the fact that the rest of the spec may be different
        //and assume that our knowledge of the pod aligns with that of the apiserver
        log.Error("pod reconciliation does not support updates; not yet implemented")
    }
}

func parseSelectorOrDie(s string) fields.Selector {
    selector, err := fields.ParseSelector(s)
    if err != nil {
        panic(err)
    }
    return selector
}

// createAllPodsLW returns a listWatch that finds all pods
func createAllPodsLW(cl *client.Client) *cache.ListWatch {
    return cache.NewListWatchFromClient(cl, "pods", api.NamespaceAll, parseSelectorOrDie(""))
}

// Consumes *api.Pod, produces *Pod; the k8s reflector wants to push *api.Pod
// objects at us, but we want to store the more flexible (Pod) type defined in
// this package. The adapter implementation facilitates this. It's a little
// hackish since the object type going in is different than the object type
// coming out -- you've been warned.
type podStoreAdapter struct {
    queue.FIFO
}

func (psa *podStoreAdapter) Add(obj interface{}) error {
    pod := obj.(*api.Pod)
    return psa.FIFO.Add(&Pod{Pod: pod})
}

func (psa *podStoreAdapter) Update(obj interface{}) error {
    pod := obj.(*api.Pod)
    return psa.FIFO.Update(&Pod{Pod: pod})
}

func (psa *podStoreAdapter) Delete(obj interface{}) error {
    pod := obj.(*api.Pod)
    return psa.FIFO.Delete(&Pod{Pod: pod})
}

func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
    pod := obj.(*api.Pod)
    return psa.FIFO.Get(&Pod{Pod: pod})
}

// Replace will delete the contents of the store, using instead the
// given list. This store implementation does NOT take ownership of the list.
func (psa *podStoreAdapter) Replace(objs []interface{}) error {
    newobjs := make([]interface{}, len(objs))
    for i, v := range objs {
        pod := v.(*api.Pod)
        newobjs[i] = &Pod{Pod: pod}
    }
    return psa.FIFO.Replace(newobjs)
}
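
// Sketch of how the adapter above is consumed (mirrors NewPluginConfig): the
// reflector pushes *api.Pod objects in, and *Pod values come back out of the
// FIFO. The backlog size here is illustrative only.
//
//    updates := make(chan queue.Entry, 100)
//    podUpdates := &podStoreAdapter{queue.NewHistorical(updates)}
//    reflector := cache.NewReflector(podsWatcher, &api.Pod{}, podUpdates, 0)
//    reflector.Run()
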
700
contrib/mesos/pkg/scheduler/plugin_test.go
Normal file
@@ -0,0 +1,700 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
    "fmt"
    "net/http"
    "net/http/httptest"
    "sync"
    "testing"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/api/testapi"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/client"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
    kutil "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/watch"

    assertext "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/assert"
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
    schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/ha"
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
    log "github.com/golang/glog"
    mesos "github.com/mesos/mesos-go/mesosproto"
    util "github.com/mesos/mesos-go/mesosutil"
    bindings "github.com/mesos/mesos-go/scheduler"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/mock"
)

// An apiserver mock which partially mocks the pods API
type TestServer struct {
    server *httptest.Server
    stats  map[string]uint
    lock   sync.Mutex
}

func NewTestServer(t *testing.T, namespace string, mockPodListWatch *MockPodsListWatch) *TestServer {
    ts := TestServer{
        stats: map[string]uint{},
    }
    mux := http.NewServeMux()

    mux.HandleFunc(testapi.ResourcePath("pods", namespace, ""), func(w http.ResponseWriter, r *http.Request) {
        w.WriteHeader(http.StatusOK)
        pods := mockPodListWatch.Pods()
        w.Write([]byte(runtime.EncodeOrDie(testapi.Codec(), &pods)))
    })

    podsPrefix := testapi.ResourcePath("pods", namespace, "") + "/"
    mux.HandleFunc(podsPrefix, func(w http.ResponseWriter, r *http.Request) {
        name := r.URL.Path[len(podsPrefix):]

        // update statistics for this pod
        ts.lock.Lock()
        defer ts.lock.Unlock()
        ts.stats[name] = ts.stats[name] + 1

        p := mockPodListWatch.GetPod(name)
        if p != nil {
            w.WriteHeader(http.StatusOK)
            w.Write([]byte(runtime.EncodeOrDie(testapi.Codec(), p)))
            return
        }
        w.WriteHeader(http.StatusNotFound)
    })

    mux.HandleFunc(testapi.ResourcePath("events", namespace, ""), func(w http.ResponseWriter, r *http.Request) {
        w.WriteHeader(http.StatusOK)
    })

    mux.HandleFunc("/", func(res http.ResponseWriter, req *http.Request) {
        t.Errorf("unexpected request: %v", req.RequestURI)
        res.WriteHeader(http.StatusNotFound)
    })

    ts.server = httptest.NewServer(mux)
    return &ts
}

func (ts *TestServer) Stats(name string) uint {
    ts.lock.Lock()
    defer ts.lock.Unlock()

    return ts.stats[name]
}

// MockPodsListWatch mocks the pods ListWatch that would normally be backed by
// the apiserver pods watch endpoint
type MockPodsListWatch struct {
    ListWatch   cache.ListWatch
    fakeWatcher *watch.FakeWatcher
    list        api.PodList
    lock        sync.Mutex
}

func NewMockPodsListWatch(initialPodList api.PodList) *MockPodsListWatch {
    lw := MockPodsListWatch{
        fakeWatcher: watch.NewFake(),
        list:        initialPodList,
    }
    lw.ListWatch = cache.ListWatch{
        WatchFunc: func(resourceVersion string) (watch.Interface, error) {
            return lw.fakeWatcher, nil
        },
        ListFunc: func() (runtime.Object, error) {
            return &lw.list, nil
        },
    }
    return &lw
}

func (lw *MockPodsListWatch) Pods() api.PodList {
    lw.lock.Lock()
    defer lw.lock.Unlock()

    return lw.list
}

func (lw *MockPodsListWatch) GetPod(name string) *api.Pod {
    lw.lock.Lock()
    defer lw.lock.Unlock()

    for _, p := range lw.list.Items {
        if p.Name == name {
            return &p
        }
    }

    return nil
}

func (lw *MockPodsListWatch) Add(pod *api.Pod, notify bool) {
    lw.lock.Lock()
    defer lw.lock.Unlock()

    lw.list.Items = append(lw.list.Items, *pod)
    if notify {
        lw.fakeWatcher.Add(pod)
    }
}

func (lw *MockPodsListWatch) Modify(pod *api.Pod, notify bool) {
    lw.lock.Lock()
    defer lw.lock.Unlock()

    for i, otherPod := range lw.list.Items {
        if otherPod.Name == pod.Name {
            lw.list.Items[i] = *pod
            if notify {
                lw.fakeWatcher.Modify(pod)
            }
            return
        }
    }
    log.Fatalf("Cannot find pod %v to modify in MockPodsListWatch", pod.Name)
}

func (lw *MockPodsListWatch) Delete(pod *api.Pod, notify bool) {
    lw.lock.Lock()
    defer lw.lock.Unlock()

    for i, otherPod := range lw.list.Items {
        if otherPod.Name == pod.Name {
            lw.list.Items = append(lw.list.Items[:i], lw.list.Items[i+1:]...)
            if notify {
                lw.fakeWatcher.Delete(&otherPod)
            }
            return
        }
    }
    log.Fatalf("Cannot find pod %v to delete in MockPodsListWatch", pod.Name)
}

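// Sketch of driving the mock from a test (mirrors TestPlugin_LifeCycle below):
//
//    podListWatch := NewMockPodsListWatch(api.PodList{})
//    pod := NewTestPod(1)
//    podListWatch.Add(pod, true)     // append to the list and notify watchers
//    podListWatch.Modify(pod, false) // update the list without notifying
//    podListWatch.Delete(pod, true)  // remove from the list and notify watchers
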
// Create a pod with a given index, requiring one port
func NewTestPod(i int) *api.Pod {
    name := fmt.Sprintf("pod%d", i)
    return &api.Pod{
        TypeMeta: api.TypeMeta{APIVersion: testapi.Version()},
        ObjectMeta: api.ObjectMeta{
            Name:      name,
            Namespace: "default",
            SelfLink:  fmt.Sprintf("http://1.2.3.4/api/v1beta1/pods/%s", name),
        },
        Spec: api.PodSpec{
            Containers: []api.Container{
                {
                    Ports: []api.ContainerPort{
                        {
                            ContainerPort: 8000 + i,
                            Protocol:      api.ProtocolTCP,
                        },
                    },
                },
            },
        },
        Status: api.PodStatus{
            PodIP: fmt.Sprintf("1.2.3.%d", 4+i),
            Conditions: []api.PodCondition{
                {
                    Type:   api.PodReady,
                    Status: api.ConditionTrue,
                },
            },
        },
    }
}

// Offering some cpus and memory and the 8000-9000 port range
func NewTestOffer(i int) *mesos.Offer {
    hostname := fmt.Sprintf("h%d", i)
    cpus := util.NewScalarResource("cpus", 3.75)
    mem := util.NewScalarResource("mem", 940)
    var port8000 uint64 = 8000
    var port9000 uint64 = 9000
    ports8000to9000 := mesos.Value_Range{Begin: &port8000, End: &port9000}
    ports := util.NewRangesResource("ports", []*mesos.Value_Range{&ports8000to9000})
    return &mesos.Offer{
        Id:        util.NewOfferID(fmt.Sprintf("offer%d", i)),
        Hostname:  &hostname,
        SlaveId:   util.NewSlaveID(hostname),
        Resources: []*mesos.Resource{cpus, mem, ports},
    }
}

// Add assertions to reason about event streams
type Event struct {
    Object  runtime.Object
    Reason  string
    Message string
}

type EventPredicate func(e Event) bool

type EventAssertions struct {
    assert.Assertions
}

// EventObserver implements record.EventRecorder for the purposes of validation via EventAssertions.
type EventObserver struct {
    fifo chan Event
}

func NewEventObserver() *EventObserver {
    return &EventObserver{
        fifo: make(chan Event, 1000),
    }
}

func (o *EventObserver) Event(object runtime.Object, reason, message string) {
    o.fifo <- Event{Object: object, Reason: reason, Message: message}
}

func (o *EventObserver) Eventf(object runtime.Object, reason, messageFmt string, args ...interface{}) {
    o.fifo <- Event{Object: object, Reason: reason, Message: fmt.Sprintf(messageFmt, args...)}
}

func (o *EventObserver) PastEventf(object runtime.Object, timestamp kutil.Time, reason, messageFmt string, args ...interface{}) {
    o.fifo <- Event{Object: object, Reason: reason, Message: fmt.Sprintf(messageFmt, args...)}
}

func (a *EventAssertions) Event(observer *EventObserver, pred EventPredicate, msgAndArgs ...interface{}) bool {
    // parse msgAndArgs: first possibly a duration, otherwise a format string with further args
    timeout := time.Second * 2
    msg := "event not received"
    msgArgStart := 0
    if len(msgAndArgs) > 0 {
        switch msgAndArgs[0].(type) {
        case time.Duration:
            timeout = msgAndArgs[0].(time.Duration)
            msgArgStart += 1
        }
    }
    if len(msgAndArgs) > msgArgStart {
        msg = fmt.Sprintf(msgAndArgs[msgArgStart].(string), msgAndArgs[msgArgStart+1:]...)
    }

    // watch events
    result := make(chan bool)
    stop := make(chan struct{})
    go func() {
        for {
            select {
            case e, ok := <-observer.fifo:
                if !ok {
                    result <- false
                    return
                } else if pred(e) {
                    log.V(3).Infof("found asserted event for reason '%v': %v", e.Reason, e.Message)
                    result <- true
                    return
                } else {
                    log.V(5).Infof("ignoring not-asserted event for reason '%v': %v", e.Reason, e.Message)
                }
            case _, ok := <-stop:
                if !ok {
                    return
                }
            }
        }
    }()
    defer close(stop)

    // wait for watch to match or timeout
    select {
    case matched := <-result:
        return matched
    case <-time.After(timeout):
        return a.Fail(msg)
    }
}

func (a *EventAssertions) EventWithReason(observer *EventObserver, reason string, msgAndArgs ...interface{}) bool {
    return a.Event(observer, func(e Event) bool {
        return e.Reason == reason
    }, msgAndArgs...)
}

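// Sketch of asserting on recorded events in a test, as done in
// TestPlugin_LifeCycle below; the leading duration overrides the default
// two-second timeout and is illustrative only:
//
//    assert := &EventAssertions{*assert.New(t)}
//    obs := NewEventObserver()
//    c.Recorder = obs
//    assert.EventWithReason(obs, "scheduled", 5*time.Second, "pod %v was never scheduled", "pod1")
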
type joinableDriver struct {
    MockSchedulerDriver
    joinFunc func() (mesos.Status, error)
}

// Join invokes joinFunc if it has been set, otherwise blocks forever
func (m *joinableDriver) Join() (mesos.Status, error) {
    if m.joinFunc != nil {
        return m.joinFunc()
    }
    select {}
}

// Create mesos.TaskStatus for a given task
func newTaskStatusForTask(task *mesos.TaskInfo, state mesos.TaskState) *mesos.TaskStatus {
    healthy := state == mesos.TaskState_TASK_RUNNING
    ts := float64(time.Now().Nanosecond()) / 1000000000.0
    source := mesos.TaskStatus_SOURCE_EXECUTOR
    return &mesos.TaskStatus{
        TaskId:     task.TaskId,
        State:      &state,
        SlaveId:    task.SlaveId,
        ExecutorId: task.Executor.ExecutorId,
        Timestamp:  &ts,
        Healthy:    &healthy,
        Source:     &source,
        Data:       task.Data,
    }
}

// Test to create the scheduler plugin with an empty plugin config
func TestPlugin_New(t *testing.T) {
    assert := assert.New(t)

    c := PluginConfig{}
    p := NewPlugin(&c)
    assert.NotNil(p)
}

// Test to create the scheduler plugin with the config returned by the scheduler,
// and play through the whole life cycle of the plugin while creating pods, deleting
// and failing them.
func TestPlugin_LifeCycle(t *testing.T) {
    assert := &EventAssertions{*assert.New(t)}

    // create a fake pod watch. We use that below to submit new pods to the scheduler
    podListWatch := NewMockPodsListWatch(api.PodList{})

    // create fake apiserver
    testApiServer := NewTestServer(t, api.NamespaceDefault, podListWatch)
    defer testApiServer.server.Close()

    // create scheduler
    testScheduler := New(Config{
        Executor: util.NewExecutorInfo(
            util.NewExecutorID("executor-id"),
            util.NewCommandInfo("executor-cmd"),
        ),
        Client:       client.NewOrDie(&client.Config{Host: testApiServer.server.URL, Version: testapi.Version()}),
        ScheduleFunc: FCFSScheduleFunc,
        Schedcfg:     *schedcfg.CreateDefaultConfig(),
    })

    assert.NotNil(testScheduler.client, "client is nil")
    assert.NotNil(testScheduler.executor, "executor is nil")
    assert.NotNil(testScheduler.offers, "offer registry is nil")

    // create scheduler process
    schedulerProcess := ha.New(testScheduler)

    // get plugin config from it
    c := testScheduler.NewPluginConfig(schedulerProcess.Terminal(), http.DefaultServeMux, &podListWatch.ListWatch)
    assert.NotNil(c)

    // make events observable
    eventObserver := NewEventObserver()
    c.Recorder = eventObserver

    // create plugin
    p := NewPlugin(c)
    assert.NotNil(p)

    // run plugin
    p.Run(schedulerProcess.Terminal())
    defer schedulerProcess.End()

    // init scheduler
    err := testScheduler.Init(schedulerProcess.Master(), p, http.DefaultServeMux)
    assert.NoError(err)

    // create mock mesos scheduler driver
    mockDriver := &joinableDriver{}
    mockDriver.On("Start").Return(mesos.Status_DRIVER_RUNNING, nil).Once()
    started := mockDriver.Upon()

    mAny := mock.AnythingOfType
    mockDriver.On("ReconcileTasks", mAny("[]*mesosproto.TaskStatus")).Return(mesos.Status_DRIVER_RUNNING, nil)
    mockDriver.On("SendFrameworkMessage", mAny("*mesosproto.ExecutorID"), mAny("*mesosproto.SlaveID"), mAny("string")).
        Return(mesos.Status_DRIVER_RUNNING, nil)

    launchedTasks := make(chan *mesos.TaskInfo, 1)
    launchTasksCalledFunc := func(args mock.Arguments) {
        taskInfos := args.Get(1).([]*mesos.TaskInfo)
        assert.Equal(1, len(taskInfos))
        launchedTasks <- taskInfos[0]
    }
    mockDriver.On("LaunchTasks", mAny("[]*mesosproto.OfferID"), mAny("[]*mesosproto.TaskInfo"), mAny("*mesosproto.Filters")).
        Return(mesos.Status_DRIVER_RUNNING, nil).Run(launchTasksCalledFunc)

    // elect master with mock driver
    driverFactory := ha.DriverFactory(func() (bindings.SchedulerDriver, error) {
        return mockDriver, nil
    })
    schedulerProcess.Elect(driverFactory)
    elected := schedulerProcess.Elected()

    // driver will be started
    <-started

    // tell scheduler to be registered
    testScheduler.Registered(
        mockDriver,
        util.NewFrameworkID("kubernetes-id"),
        util.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050),
    )

    // wait for being elected
    <-elected

    //TODO(jdef) refactor things above here into a test suite setup of some sort

    // fake new, unscheduled pod
    pod1 := NewTestPod(1)
    podListWatch.Add(pod1, true) // notify watchers

    // wait for failedScheduling event because there is no offer
    assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")

    // add some matching offer
    offers1 := []*mesos.Offer{NewTestOffer(1)}
    testScheduler.ResourceOffers(nil, offers1)

    // and wait for scheduled pod
    assert.EventWithReason(eventObserver, "scheduled")
    select {
    case launchedTask := <-launchedTasks:
        // report back that the task has been staged, and then started by mesos
        testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING))
        testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING))

        // report back that the task has been lost
        mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)
        testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_LOST))

        // and wait until the framework message has been sent to the executor
        mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 1)

    case <-time.After(5 * time.Second):
        t.Fatalf("timed out waiting for launchTasks call")
    }

    // start another pod
    podNum := 1
    startPod := func(offers []*mesos.Offer) (*api.Pod, *mesos.TaskInfo) {
        podNum = podNum + 1

        // create pod and matching offer
        pod := NewTestPod(podNum)
        podListWatch.Add(pod, true) // notify watchers
        testScheduler.ResourceOffers(mockDriver, offers)
        assert.EventWithReason(eventObserver, "scheduled")

        // wait for driver.launchTasks call
        select {
        case launchedTask := <-launchedTasks:
            testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING))
            testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING))
            return pod, launchedTask

        case <-time.After(5 * time.Second):
            t.Fatal("timed out waiting for launchTasks")
            return nil, nil
        }
    }

    pod, launchedTask := startPod(offers1)

    // mock driver.KillTask, should be invoked when a pod is deleted
    mockDriver.On("KillTask", mAny("*mesosproto.TaskID")).Return(mesos.Status_DRIVER_RUNNING, nil).Run(func(args mock.Arguments) {
        killedTaskId := *(args.Get(0).(*mesos.TaskID))
        assert.Equal(*launchedTask.TaskId, killedTaskId, "expected same TaskID as during launch")
    })
    killTaskCalled := mockDriver.Upon()

    // stop it again via the apiserver mock
    podListWatch.Delete(pod, true) // notify watchers

    // and wait for the driver killTask call with the correct TaskId
    select {
    case <-killTaskCalled:
        // report back that the task is finished
        testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_FINISHED))

    case <-time.After(5 * time.Second):
        t.Fatal("timed out waiting for KillTask")
    }

    // start pods:
    // - which are failing while binding,
    // - leading to reconciliation
    // - with different states on the apiserver

    failPodFromExecutor := func(task *mesos.TaskInfo) {
        beforePodLookups := testApiServer.Stats(pod.Name)
        status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
        message := messages.CreateBindingFailure
        status.Message = &message
        testScheduler.StatusUpdate(mockDriver, status)

        // wait until pod is looked up at the apiserver
        assertext.EventuallyTrue(t, time.Second, func() bool {
            return testApiServer.Stats(pod.Name) == beforePodLookups+1
        }, "expect that reconcilePod will access apiserver for pod %v", pod.Name)
    }

    // 1. with pod deleted from the apiserver
    pod, launchedTask = startPod(offers1)
    podListWatch.Delete(pod, false) // not notifying the watchers
    failPodFromExecutor(launchedTask)

    // 2. with pod still on the apiserver, not bound
    pod, launchedTask = startPod(offers1)
    failPodFromExecutor(launchedTask)

    // 3. with pod still on the apiserver, bound i.e. host!=""
    pod, launchedTask = startPod(offers1)
    pod.Spec.NodeName = *offers1[0].Hostname
    podListWatch.Modify(pod, false) // not notifying the watchers
    failPodFromExecutor(launchedTask)

    // 4. with pod still on the apiserver, bound i.e. host!="", notified via ListWatch
    pod, launchedTask = startPod(offers1)
    pod.Spec.NodeName = *offers1[0].Hostname
    podListWatch.Modify(pod, true) // notifying the watchers
    time.Sleep(time.Second / 2)
    failPodFromExecutor(launchedTask)
}

func TestDeleteOne_NonexistentPod(t *testing.T) {
    assert := assert.New(t)
    obj := &MockScheduler{}
    reg := podtask.NewInMemoryRegistry()
    obj.On("tasks").Return(reg)

    qr := newQueuer(nil)
    assert.Equal(0, len(qr.podQueue.List()))
    d := &deleter{
        api: obj,
        qr:  qr,
    }
    pod := &Pod{Pod: &api.Pod{
        ObjectMeta: api.ObjectMeta{
            Name:      "foo",
            Namespace: api.NamespaceDefault,
        }}}
    err := d.deleteOne(pod)
    assert.Equal(err, noSuchPodErr)
    obj.AssertExpectations(t)
}

func TestDeleteOne_PendingPod(t *testing.T) {
    assert := assert.New(t)
    obj := &MockScheduler{}
    reg := podtask.NewInMemoryRegistry()
    obj.On("tasks").Return(reg)

    pod := &Pod{Pod: &api.Pod{
        ObjectMeta: api.ObjectMeta{
            Name:      "foo",
            UID:       "foo0",
            Namespace: api.NamespaceDefault,
        }}}
    _, err := reg.Register(podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{}))
    if err != nil {
        t.Fatalf("failed to create task: %v", err)
    }

    // preconditions
    qr := newQueuer(nil)
    qr.podQueue.Add(pod, queue.ReplaceExisting)
    assert.Equal(1, len(qr.podQueue.List()))
    _, found := qr.podQueue.Get("default/foo")
    assert.True(found)

    // exec & post conditions
    d := &deleter{
        api: obj,
        qr:  qr,
    }
    err = d.deleteOne(pod)
    assert.Nil(err)
    _, found = qr.podQueue.Get("foo0")
    assert.False(found)
    assert.Equal(0, len(qr.podQueue.List()))
    obj.AssertExpectations(t)
}

func TestDeleteOne_Running(t *testing.T) {
    assert := assert.New(t)
    obj := &MockScheduler{}
    reg := podtask.NewInMemoryRegistry()
    obj.On("tasks").Return(reg)

    pod := &Pod{Pod: &api.Pod{
        ObjectMeta: api.ObjectMeta{
            Name:      "foo",
            UID:       "foo0",
            Namespace: api.NamespaceDefault,
        }}}
    task, err := reg.Register(podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{}))
    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    task.Set(podtask.Launched)
    err = reg.Update(task)
    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    // preconditions
    qr := newQueuer(nil)
    qr.podQueue.Add(pod, queue.ReplaceExisting)
    assert.Equal(1, len(qr.podQueue.List()))
    _, found := qr.podQueue.Get("default/foo")
    assert.True(found)

    obj.On("killTask", task.ID).Return(nil)

    // exec & post conditions
    d := &deleter{
        api: obj,
        qr:  qr,
    }
    err = d.deleteOne(pod)
    assert.Nil(err)
    _, found = qr.podQueue.Get("foo0")
    assert.False(found)
    assert.Equal(0, len(qr.podQueue.List()))
    obj.AssertExpectations(t)
}

func TestDeleteOne_badPodNaming(t *testing.T) {
    assert := assert.New(t)
    obj := &MockScheduler{}
    pod := &Pod{Pod: &api.Pod{}}
    d := &deleter{
        api: obj,
        qr:  newQueuer(nil),
    }

    err := d.deleteOne(pod)
    assert.NotNil(err)

    pod.Pod.ObjectMeta.Name = "foo"
    err = d.deleteOne(pod)
    assert.NotNil(err)

    pod.Pod.ObjectMeta.Name = ""
    pod.Pod.ObjectMeta.Namespace = "bar"
    err = d.deleteOne(pod)
    assert.NotNil(err)

    obj.AssertExpectations(t)
}
80
contrib/mesos/pkg/scheduler/pod.go
Normal file
@@ -0,0 +1,80 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
    "fmt"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
)

// wrapper for the k8s pod type so that we can define additional methods on a "pod"
type Pod struct {
    *api.Pod
    deadline *time.Time
    delay    *time.Duration
    notify   queue.BreakChan
}

// implements Copyable
func (p *Pod) Copy() queue.Copyable {
    if p == nil {
        return nil
    }
    //TODO(jdef) we may need a better "deep-copy" implementation
    pod := *(p.Pod)
    return &Pod{Pod: &pod}
}

// implements Unique
func (p *Pod) GetUID() string {
    if id, err := cache.MetaNamespaceKeyFunc(p.Pod); err != nil {
        panic(fmt.Sprintf("failed to determine pod id for '%+v'", p.Pod))
    } else {
        return id
    }
}

// implements Deadlined
func (p *Pod) Deadline() (time.Time, bool) {
    if p.deadline != nil {
        return *(p.deadline), true
    }
    return time.Time{}, false
}

func (p *Pod) GetDelay() time.Duration {
    if p.delay != nil {
        return *(p.delay)
    }
    return 0
}

func (p *Pod) Breaker() queue.BreakChan {
    return p.notify
}

func (p *Pod) String() string {
    displayDeadline := "<none>"
    if deadline, ok := p.Deadline(); ok {
        displayDeadline = deadline.String()
    }
    return fmt.Sprintf("{pod:%v, deadline:%v, delay:%v}", p.Pod.Name, displayDeadline, p.GetDelay())
}
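
// Sketch of how the scheduler populates this wrapper (cf. the error handler
// and reconcilePod in plugin.go); the values are illustrative:
//
//    delay := 5 * time.Second
//    requeued := &Pod{Pod: apiPod, delay: &delay, notify: breakout} // backoff requeue
//
//    now := time.Now()
//    reoffered := &Pod{Pod: apiPod, deadline: &now} // eligible immediately
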
54
contrib/mesos/pkg/scheduler/podtask/debug.go
Normal file
@@ -0,0 +1,54 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
    "fmt"
    "io"
    "net/http"

    log "github.com/golang/glog"
)

//TODO(jdef) we use a Locker to guard against concurrent task state changes, but it would be
//really, really nice to avoid doing this. Maybe someday the registry won't return data ptrs
//but plain structs instead.
func InstallDebugHandlers(reg Registry, mux *http.ServeMux) {
    mux.HandleFunc("/debug/registry/tasks", func(w http.ResponseWriter, r *http.Request) {
        //TODO(jdef) support filtering tasks based on status
        alltasks := reg.List(nil)
        io.WriteString(w, fmt.Sprintf("task_count=%d\n", len(alltasks)))
        for _, task := range alltasks {
            if err := func() (err error) {
                podName := task.Pod.Name
                podNamespace := task.Pod.Namespace
                offerId := ""
                if task.Offer != nil {
                    offerId = task.Offer.Id()
                }
                _, err = io.WriteString(w, fmt.Sprintf("%v\t%v/%v\t%v\t%v\n", task.ID, podNamespace, podName, task.State, offerId))
                return
            }(); err != nil {
                log.Warningf("aborting debug handler: %v", err)
                break // stop listing on I/O errors
            }
        }
        if flusher, ok := w.(http.Flusher); ok {
            flusher.Flush()
        }
    })
}
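
// Sketch of exercising the handler above from a test; the helper name is
// hypothetical and assumes additional "net/http/httptest" and "io/ioutil"
// imports.
func exampleQueryTasks(reg Registry) (string, error) {
    mux := http.NewServeMux()
    InstallDebugHandlers(reg, mux)
    srv := httptest.NewServer(mux)
    defer srv.Close()

    resp, err := http.Get(srv.URL + "/debug/registry/tasks")
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    body, err := ioutil.ReadAll(resp.Body) // first line: task_count=N
    return string(body), err
}
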
18
contrib/mesos/pkg/scheduler/podtask/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package podtask maps Kubernetes pods to Mesos tasks.
package podtask
29
contrib/mesos/pkg/scheduler/podtask/leaky.go
Normal file
@@ -0,0 +1,29 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

// Concepts that have leaked to where they should not have.

import (
    "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/registry/etcd"
)

// MakePodKey constructs etcd paths to pod items enforcing namespace rules.
func MakePodKey(ctx api.Context, id string) (string, error) {
    return etcd.MakeEtcdItemKey(ctx, PodPath, id)
}
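
// Example (sketch; the exact result depends on PodPath):
//
//    ctx := api.WithNamespace(api.NewDefaultContext(), "bar")
//    key, err := MakePodKey(ctx, "foo") // -> "<PodPath>/bar/foo"
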
373
contrib/mesos/pkg/scheduler/podtask/pod_task.go
Normal file
@@ -0,0 +1,373 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
    "fmt"
    "strings"
    "time"

    "code.google.com/p/go-uuid/uuid"
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
    annotation "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
    "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
    "github.com/gogo/protobuf/proto"
    log "github.com/golang/glog"
    mesos "github.com/mesos/mesos-go/mesosproto"
    mutil "github.com/mesos/mesos-go/mesosutil"
)

const (
    containerCpus = 0.25 // initial CPU allocated for executor
    containerMem  = 64   // initial MB of memory allocated for executor
)

type StateType int

const (
    StatePending StateType = iota
    StateRunning
    StateFinished
    StateUnknown
)

type FlagType string

const (
    Launched = FlagType("launched")
    Bound    = FlagType("bound")
    Deleted  = FlagType("deleted")
)

// T describes a pod task.
type T struct {
    ID          string
    Pod         api.Pod
    Spec        Spec
    Offer       offers.Perishable // thread-safe
    State       StateType
    Flags       map[FlagType]struct{}
    CreateTime  time.Time
    UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master

    podStatus  api.PodStatus
    executor   *mesos.ExecutorInfo // readonly
    podKey     string
    launchTime time.Time
    bindTime   time.Time
    mapper     HostPortMappingType
}

type Spec struct {
    SlaveID string
    CPU     float64
    Memory  float64
    PortMap []HostPortMapping
    Ports   []uint64
    Data    []byte
}

// Clone mostly-clones this pod task; the clone will actually share some fields:
//   - executor // OK because it's read only
//   - Offer    // OK because it guarantees safe concurrent access
func (t *T) Clone() *T {
    if t == nil {
        return nil
    }

    // shallow-copy
    clone := *t

    // deep copy
    (&t.Spec).copyTo(&clone.Spec)
    clone.Flags = map[FlagType]struct{}{}
    for k := range t.Flags {
        clone.Flags[k] = struct{}{}
    }
    return &clone
}

func (old *Spec) copyTo(new *Spec) {
    if len(old.PortMap) > 0 {
        new.PortMap = append(([]HostPortMapping)(nil), old.PortMap...)
    }
    if len(old.Ports) > 0 {
        new.Ports = append(([]uint64)(nil), old.Ports...)
    }
    if len(old.Data) > 0 {
        new.Data = append(([]byte)(nil), old.Data...)
    }
}

func (t *T) HasAcceptedOffer() bool {
    return t.Spec.SlaveID != ""
}

func (t *T) GetOfferId() string {
    if t.Offer == nil {
        return ""
    }
    return t.Offer.Details().Id.GetValue()
}

func generateTaskName(pod *api.Pod) string {
    ns := pod.Namespace
    if ns == "" {
        ns = api.NamespaceDefault
    }
    return fmt.Sprintf("%s.%s.pods", pod.Name, ns)
}

func (t *T) BuildTaskInfo() *mesos.TaskInfo {
    info := &mesos.TaskInfo{
        Name:     proto.String(generateTaskName(&t.Pod)),
        TaskId:   mutil.NewTaskID(t.ID),
        SlaveId:  mutil.NewSlaveID(t.Spec.SlaveID),
        Executor: t.executor,
        Data:     t.Spec.Data,
        Resources: []*mesos.Resource{
            mutil.NewScalarResource("cpus", t.Spec.CPU),
            mutil.NewScalarResource("mem", t.Spec.Memory),
        },
    }
    if portsResource := rangeResource("ports", t.Spec.Ports); portsResource != nil {
        info.Resources = append(info.Resources, portsResource)
    }
    return info
}

// FillFromDetails fills the Spec in the T; it should be called during k8s
// scheduling, before binding.
func (t *T) FillFromDetails(details *mesos.Offer) error {
    if details == nil {
        //programming error
        panic("offer details are nil")
    }

    log.V(3).Infof("Recording offer(s) %v against pod %v", details.Id, t.Pod.Name)

    t.Spec = Spec{
        SlaveID: details.GetSlaveId().GetValue(),
        CPU:     containerCpus,
        Memory:  containerMem,
    }

    if mapping, err := t.mapper.Generate(t, details); err != nil {
        t.Reset()
        return err
    } else {
        ports := []uint64{}
        for _, entry := range mapping {
            ports = append(ports, entry.OfferPort)
        }
        t.Spec.PortMap = mapping
        t.Spec.Ports = ports
    }

    // the hostname of the executor needs to match that of the offer, otherwise
    // the kubelet node status checker/updater is very unhappy
    const HOSTNAME_OVERRIDE_FLAG = "--hostname-override="
    hostname := details.GetHostname() // required field, non-empty
    hostnameOverride := HOSTNAME_OVERRIDE_FLAG + hostname

    argv := t.executor.Command.Arguments
    overwrite := false
    for i, arg := range argv {
        if strings.HasPrefix(arg, HOSTNAME_OVERRIDE_FLAG) {
            overwrite = true
            argv[i] = hostnameOverride
            break
        }
    }
    if !overwrite {
        t.executor.Command.Arguments = append(argv, hostnameOverride)
    }
    return nil
}

// Reset clears offer-related details from the task; it should be called
// if/when an offer has already been assigned to a task but for some reason
// is no longer valid.
func (t *T) Reset() {
    log.V(3).Infof("Clearing offer(s) from pod %v", t.Pod.Name)
    t.Offer = nil
    t.Spec = Spec{}
}

func (t *T) AcceptOffer(offer *mesos.Offer) bool {
    if offer == nil {
        return false
    }
    var (
        cpus float64 = 0
        mem  float64 = 0
    )
    for _, resource := range offer.Resources {
        if resource.GetName() == "cpus" {
            cpus = *resource.GetScalar().Value
        }

        if resource.GetName() == "mem" {
            mem = *resource.GetScalar().Value
        }
    }
    if _, err := t.mapper.Generate(t, offer); err != nil {
        log.V(3).Info(err)
        return false
    }

    // for now hard-coded, constant values are used for cpus and mem. This is necessary
    // until parent-cgroup integration is finished for mesos and k8sm. Then the k8sm
    // executor can become the parent of pods and subsume their resource usage and
    // therefore be compliant with expectations of mesos executors w/ respect to
    // resource allocation and management.
    //
    // TODO(jdef): remove hardcoded values and make use of actual pod resource settings
    if (cpus < containerCpus) || (mem < containerMem) {
        log.V(3).Infof("not enough resources: cpus: %f mem: %f", cpus, mem)
        return false
    }
    return true
}

func (t *T) Set(f FlagType) {
    t.Flags[f] = struct{}{}
    if Launched == f {
        t.launchTime = time.Now()
        queueWaitTime := t.launchTime.Sub(t.CreateTime)
        metrics.QueueWaitTime.Observe(metrics.InMicroseconds(queueWaitTime))
    }
}

func (t *T) Has(f FlagType) (exists bool) {
    _, exists = t.Flags[f]
    return
}

func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo) (*T, error) {
    if executor == nil {
        return nil, fmt.Errorf("illegal argument: executor was nil")
    }
    key, err := MakePodKey(ctx, pod.Name)
    if err != nil {
        return nil, err
    }
    if id == "" {
        id = "pod." + uuid.NewUUID().String()
    }
    task := &T{
        ID:       id,
        Pod:      pod,
        State:    StatePending,
        podKey:   key,
        mapper:   MappingTypeForPod(&pod),
        Flags:    make(map[FlagType]struct{}),
        executor: proto.Clone(executor).(*mesos.ExecutorInfo),
    }
    task.CreateTime = time.Now()
    return task, nil
}

func (t *T) SaveRecoveryInfo(dict map[string]string) {
    dict[annotation.TaskIdKey] = t.ID
    dict[annotation.SlaveIdKey] = t.Spec.SlaveID
    dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue()
    dict[annotation.ExecutorIdKey] = t.executor.ExecutorId.GetValue()
}

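// Sketch of the pending-task lifecycle implemented above, from construction
// to a launchable TaskInfo; `pod`, `executor` and a perishable `offer` are
// assumptions here and error handling is elided:
//
//    task, _ := New(api.NewDefaultContext(), "", pod, executor)
//    if task.AcceptOffer(offer.Details()) { // resources and ports fit?
//        task.Offer = offer
//        _ = task.FillFromDetails(offer.Details()) // fill Spec from the offer
//        info := task.BuildTaskInfo()              // hand to driver.LaunchTasks
//        _ = info
//    }
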
// RecoverFrom reconstructs a task from metadata stashed in a pod entry. There
// are limited pod states that support reconstruction: if we expect to be able
// to reconstruct state but encounter errors in the process, those errors are
// returned; if the pod is in a seemingly valid state but otherwise does not
// support task reconstruction, (nil, false, nil) is returned; if state can be
// reconstructed, the reconstructed task and true are returned.
//
// At this time task reconstruction is only supported for pods that have been
// annotated with binding metadata, which implies that they've previously been
// associated with a task and that mesos knows about it.
//
// Assumes that the pod data comes from the k8s registry and reflects the
// desired state.
func RecoverFrom(pod api.Pod) (*T, bool, error) {
    // we only expect annotations if pod has been bound, which implies that it has already
    // been scheduled and launched
    if pod.Spec.NodeName == "" && len(pod.Annotations) == 0 {
        log.V(1).Infof("skipping recovery for unbound pod %v/%v", pod.Namespace, pod.Name)
        return nil, false, nil
    }

    // only process pods that are not in a terminal state
    switch pod.Status.Phase {
    case api.PodPending, api.PodRunning, api.PodUnknown: // continue
    default:
        log.V(1).Infof("skipping recovery for terminal pod %v/%v", pod.Namespace, pod.Name)
        return nil, false, nil
    }

    ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
    key, err := MakePodKey(ctx, pod.Name)
    if err != nil {
        return nil, false, err
    }

    //TODO(jdef) recover ports (and other resource requirements?) from the pod spec as well

    now := time.Now()
    t := &T{
        Pod:        pod,
        CreateTime: now,
        podKey:     key,
        State:      StatePending, // possibly running? mesos will tell us during reconciliation
        Flags:      make(map[FlagType]struct{}),
        mapper:     MappingTypeForPod(&pod),
        launchTime: now,
        bindTime:   now,
    }
    var (
        offerId  string
        hostname string
    )
    for _, k := range []string{
        annotation.BindingHostKey,
        annotation.TaskIdKey,
        annotation.SlaveIdKey,
        annotation.OfferIdKey,
        annotation.ExecutorIdKey,
    } {
        v, found := pod.Annotations[k]
        if !found {
            return nil, false, fmt.Errorf("incomplete metadata: missing value for pod annotation: %v", k)
        }
        switch k {
        case annotation.BindingHostKey:
            hostname = v
        case annotation.SlaveIdKey:
            t.Spec.SlaveID = v
        case annotation.OfferIdKey:
            offerId = v
        case annotation.TaskIdKey:
            t.ID = v
        case annotation.ExecutorIdKey:
            // this is nowhere near sufficient to re-launch a task, but we really just
            // want this for tracking
            t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
        }
    }
    t.Offer = offers.Expired(offerId, hostname, 0)
    t.Flags[Launched] = struct{}{}
    t.Flags[Bound] = struct{}{}
    return t, true, nil
}
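
// Sketch of the recovery round trip: a bound pod carries the annotations
// written by SaveRecoveryInfo (plus BindingHostKey), from which a partial
// task can be rebuilt after scheduler failover:
//
//    task, ok, err := RecoverFrom(pod)
//    if err == nil && ok {
//        // task.ID, task.Spec.SlaveID and an expired Offer stub are restored;
//        // mesos task reconciliation later fills in the live state
//    }
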
153
contrib/mesos/pkg/scheduler/podtask/pod_task_test.go
Normal file
@@ -0,0 +1,153 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
    "testing"

    "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
    mesos "github.com/mesos/mesos-go/mesosproto"
    mutil "github.com/mesos/mesos-go/mesosutil"
)

const (
    t_min_cpu = 128
    t_min_mem = 128
)

func fakePodTask(id string) (*T, error) {
    return New(api.NewDefaultContext(), "", api.Pod{
        ObjectMeta: api.ObjectMeta{
            Name:      id,
            Namespace: api.NamespaceDefault,
        },
    }, &mesos.ExecutorInfo{})
}

func TestEmptyOffer(t *testing.T) {
    t.Parallel()
    task, err := fakePodTask("foo")
    if err != nil {
        t.Fatal(err)
    }
    if ok := task.AcceptOffer(nil); ok {
        t.Fatalf("accepted nil offer")
    }
    if ok := task.AcceptOffer(&mesos.Offer{}); ok {
        t.Fatalf("accepted empty offer")
    }
}

func TestNoPortsInPodOrOffer(t *testing.T) {
    t.Parallel()
    task, err := fakePodTask("foo")
    if err != nil || task == nil {
        t.Fatal(err)
    }

    offer := &mesos.Offer{
        Resources: []*mesos.Resource{
            mutil.NewScalarResource("cpus", 0.001),
            mutil.NewScalarResource("mem", 0.001),
        },
    }
    if ok := task.AcceptOffer(offer); ok {
        t.Fatalf("accepted offer %v:", offer)
    }

    offer = &mesos.Offer{
        Resources: []*mesos.Resource{
            mutil.NewScalarResource("cpus", t_min_cpu),
            mutil.NewScalarResource("mem", t_min_mem),
        },
    }
    if ok := task.AcceptOffer(offer); !ok {
        t.Fatalf("did not accept offer %v:", offer)
    }
}

func TestAcceptOfferPorts(t *testing.T) {
|
||||
t.Parallel()
|
||||
task, _ := fakePodTask("foo")
|
||||
pod := &task.Pod
|
||||
|
||||
offer := &mesos.Offer{
|
||||
Resources: []*mesos.Resource{
|
||||
mutil.NewScalarResource("cpus", t_min_cpu),
|
||||
mutil.NewScalarResource("mem", t_min_mem),
|
||||
rangeResource("ports", []uint64{1, 1}),
|
||||
},
|
||||
}
|
||||
if ok := task.AcceptOffer(offer); !ok {
|
||||
t.Fatalf("did not accepted offer %v:", offer)
|
||||
}
|
||||
|
||||
pod.Spec = api.PodSpec{
|
||||
Containers: []api.Container{{
|
||||
Ports: []api.ContainerPort{{
|
||||
HostPort: 123,
|
||||
}},
|
||||
}},
|
||||
}
|
||||
if ok := task.AcceptOffer(offer); ok {
|
||||
t.Fatalf("accepted offer %v:", offer)
|
||||
}
|
||||
|
||||
pod.Spec.Containers[0].Ports[0].HostPort = 1
|
||||
if ok := task.AcceptOffer(offer); !ok {
|
||||
t.Fatalf("did not accepted offer %v:", offer)
|
||||
}
|
||||
|
||||
pod.Spec.Containers[0].Ports[0].HostPort = 0
|
||||
if ok := task.AcceptOffer(offer); !ok {
|
||||
t.Fatalf("did not accepted offer %v:", offer)
|
||||
}
|
||||
|
||||
offer.Resources = []*mesos.Resource{
|
||||
mutil.NewScalarResource("cpus", t_min_cpu),
|
||||
mutil.NewScalarResource("mem", t_min_mem),
|
||||
}
|
||||
if ok := task.AcceptOffer(offer); ok {
|
||||
t.Fatalf("accepted offer %v:", offer)
|
||||
}
|
||||
|
||||
pod.Spec.Containers[0].Ports[0].HostPort = 1
|
||||
if ok := task.AcceptOffer(offer); ok {
|
||||
t.Fatalf("accepted offer %v:", offer)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGeneratePodName(t *testing.T) {
|
||||
p := &api.Pod{
|
||||
ObjectMeta: api.ObjectMeta{
|
||||
Name: "foo",
|
||||
Namespace: "bar",
|
||||
},
|
||||
}
|
||||
name := generateTaskName(p)
|
||||
expected := "foo.bar.pods"
|
||||
if name != expected {
|
||||
t.Fatalf("expected %q instead of %q", expected, name)
|
||||
}
|
||||
|
||||
p.Namespace = ""
|
||||
name = generateTaskName(p)
|
||||
expected = "foo.default.pods"
|
||||
if name != expected {
|
||||
t.Fatalf("expected %q instead of %q", expected, name)
|
||||
}
|
||||
}
|
||||
185
contrib/mesos/pkg/scheduler/podtask/port_mapping.go
Normal file
@@ -0,0 +1,185 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
	"fmt"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
	log "github.com/golang/glog"
	mesos "github.com/mesos/mesos-go/mesosproto"
)

type HostPortMappingType string

const (
	// maps a Container.HostPort to the same exact offered host port, ignores .HostPort = 0
	HostPortMappingFixed HostPortMappingType = "fixed"
	// same as HostPortMappingFixed, except that a .HostPort of 0 is mapped to any port offered
	HostPortMappingWildcard = "wildcard"
)

type HostPortMapper interface {
	// abstracts the way that host ports are mapped to pod container ports
	Generate(t *T, offer *mesos.Offer) ([]HostPortMapping, error)
}

type HostPortMapping struct {
	ContainerIdx int // index of the container in the pod spec
	PortIdx      int // index of the port in a container's port spec
	OfferPort    uint64
}

func (self HostPortMappingType) Generate(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
	switch self {
	case HostPortMappingWildcard:
		return wildcardHostPortMapping(t, offer)
	case HostPortMappingFixed:
	default:
		log.Warningf("illegal host-port mapping spec %q, defaulting to %q", self, HostPortMappingFixed)
	}
	return defaultHostPortMapping(t, offer)
}

type PortAllocationError struct {
	PodId string
	Ports []uint64
}

func (err *PortAllocationError) Error() string {
	return fmt.Sprintf("Could not schedule pod %s: %d port(s) could not be allocated", err.PodId, len(err.Ports))
}

type DuplicateHostPortError struct {
	m1, m2 HostPortMapping
}

func (err *DuplicateHostPortError) Error() string {
	return fmt.Sprintf(
		"Host port %d is specified for container %d, port %d and container %d, port %d",
		err.m1.OfferPort, err.m1.ContainerIdx, err.m1.PortIdx, err.m2.ContainerIdx, err.m2.PortIdx)
}

// wildcard k8s host port mapping implementation: hostPort == 0 gets mapped to any available offer port
func wildcardHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
	mapping, err := defaultHostPortMapping(t, offer)
	if err != nil {
		return nil, err
	}
	taken := make(map[uint64]struct{})
	for _, entry := range mapping {
		taken[entry.OfferPort] = struct{}{}
	}
	wildports := []HostPortMapping{}
	for i, container := range t.Pod.Spec.Containers {
		for pi, port := range container.Ports {
			if port.HostPort == 0 {
				wildports = append(wildports, HostPortMapping{
					ContainerIdx: i,
					PortIdx:      pi,
				})
			}
		}
	}
	remaining := len(wildports)
	foreachRange(offer, "ports", func(bp, ep uint64) {
		log.V(3).Infof("Searching for wildcard port in range {%d:%d}", bp, ep)
		for _, entry := range wildports {
			if entry.OfferPort != 0 {
				continue
			}
			for port := bp; port <= ep && remaining > 0; port++ {
				if _, inuse := taken[port]; inuse {
					continue
				}
				entry.OfferPort = port
				mapping = append(mapping, entry)
				remaining--
				taken[port] = struct{}{}
				break
			}
		}
	})
	if remaining > 0 {
		err := &PortAllocationError{
			PodId: t.Pod.Name,
		}
		// it doesn't make sense to include a port list here because they were all zero (wildcards)
		return nil, err
	}
	return mapping, nil
}

// default k8s host port mapping implementation: hostPort == 0 means containerPort remains pod-private, and so
// no offer ports will be mapped to such Container ports.
func defaultHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
	requiredPorts := make(map[uint64]HostPortMapping)
	mapping := []HostPortMapping{}
	for i, container := range t.Pod.Spec.Containers {
		// strip all port==0 from this array; k8s already knows what to do with zero-
		// ports (it does not create 'port bindings' on the minion-host); we need to
		// remove the wildcards from this array since they don't consume host resources
		for pi, port := range container.Ports {
			if port.HostPort == 0 {
				continue // ignore
			}
			m := HostPortMapping{
				ContainerIdx: i,
				PortIdx:      pi,
				OfferPort:    uint64(port.HostPort),
			}
			if entry, inuse := requiredPorts[uint64(port.HostPort)]; inuse {
				return nil, &DuplicateHostPortError{entry, m}
			}
			requiredPorts[uint64(port.HostPort)] = m
		}
	}
	foreachRange(offer, "ports", func(bp, ep uint64) {
		for port := range requiredPorts {
			log.V(3).Infof("evaluating port range {%d:%d} %d", bp, ep, port)
			if (bp <= port) && (port <= ep) {
				mapping = append(mapping, requiredPorts[port])
				delete(requiredPorts, port)
			}
		}
	})
	unsatisfiedPorts := len(requiredPorts)
	if unsatisfiedPorts > 0 {
		err := &PortAllocationError{
			PodId: t.Pod.Name,
		}
		for p := range requiredPorts {
			err.Ports = append(err.Ports, p)
		}
		return nil, err
	}
	return mapping, nil
}

const PortMappingLabelKey = "k8s.mesosphere.io/portMapping"

func MappingTypeForPod(pod *api.Pod) HostPortMappingType {
	filter := map[string]string{
		PortMappingLabelKey: string(HostPortMappingFixed),
	}
	selector := labels.Set(filter).AsSelector()
	if selector.Matches(labels.Set(pod.Labels)) {
		return HostPortMappingFixed
	}
	return HostPortMappingWildcard
}
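The mapping type is chosen per pod via the k8s.mesosphere.io/portMapping label; any value other than "fixed" (including no label at all) falls back to wildcard. A minimal sketch, not part of this commit; the pod here is hypothetical:

func exampleMappingSelection() HostPortMappingType {
	pod := &api.Pod{
		ObjectMeta: api.ObjectMeta{
			Name:   "web", // hypothetical pod
			Labels: map[string]string{PortMappingLabelKey: string(HostPortMappingFixed)},
		},
	}
	return MappingTypeForPod(pod) // HostPortMappingFixed; unlabeled pods yield HostPortMappingWildcard
}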
205
contrib/mesos/pkg/scheduler/podtask/port_mapping_test.go
Normal file
@@ -0,0 +1,205 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
	"testing"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	mesos "github.com/mesos/mesos-go/mesosproto"
)

func TestDefaultHostPortMatching(t *testing.T) {
	t.Parallel()
	task, _ := fakePodTask("foo")
	pod := &task.Pod

	offer := &mesos.Offer{
		Resources: []*mesos.Resource{
			rangeResource("ports", []uint64{1, 1}),
		},
	}
	mapping, err := defaultHostPortMapping(task, offer)
	if err != nil {
		t.Fatal(err)
	}
	if len(mapping) > 0 {
		t.Fatalf("Found mappings for a pod without ports: %v", pod)
	}

	//--
	pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Ports: []api.ContainerPort{{
				HostPort: 123,
			}, {
				HostPort: 123,
			}},
		}},
	}
	task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
	if err != nil {
		t.Fatal(err)
	}
	_, err = defaultHostPortMapping(task, offer)
	if err, _ := err.(*DuplicateHostPortError); err == nil {
		t.Fatal("Expected duplicate port error")
	} else if err.m1.OfferPort != 123 {
		t.Fatal("Expected duplicate host port 123")
	}
}

func TestWildcardHostPortMatching(t *testing.T) {
	t.Parallel()
	task, _ := fakePodTask("foo")
	pod := &task.Pod

	offer := &mesos.Offer{}
	mapping, err := wildcardHostPortMapping(task, offer)
	if err != nil {
		t.Fatal(err)
	}
	if len(mapping) > 0 {
		t.Fatalf("Found mappings for an empty offer and a pod without ports: %v", pod)
	}

	//--
	offer = &mesos.Offer{
		Resources: []*mesos.Resource{
			rangeResource("ports", []uint64{1, 1}),
		},
	}
	mapping, err = wildcardHostPortMapping(task, offer)
	if err != nil {
		t.Fatal(err)
	}
	if len(mapping) > 0 {
		t.Fatalf("Found mappings for a pod without ports: %v", pod)
	}

	//--
	pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Ports: []api.ContainerPort{{
				HostPort: 123,
			}},
		}},
	}
	task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
	if err != nil {
		t.Fatal(err)
	}
	mapping, err = wildcardHostPortMapping(task, offer)
	if err == nil {
		t.Fatalf("expected error instead of mappings: %#v", mapping)
	} else if err, _ := err.(*PortAllocationError); err == nil {
		t.Fatal("Expected port allocation error")
	} else if !(len(err.Ports) == 1 && err.Ports[0] == 123) {
		t.Fatal("Expected port allocation error for host port 123")
	}

	//--
	pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Ports: []api.ContainerPort{{
				HostPort: 0,
			}, {
				HostPort: 123,
			}},
		}},
	}
	task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
	if err != nil {
		t.Fatal(err)
	}
	mapping, err = wildcardHostPortMapping(task, offer)
	if err, _ := err.(*PortAllocationError); err == nil {
		t.Fatal("Expected port allocation error")
	} else if !(len(err.Ports) == 1 && err.Ports[0] == 123) {
		t.Fatal("Expected port allocation error for host port 123")
	}

	//--
	pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Ports: []api.ContainerPort{{
				HostPort: 0,
			}, {
				HostPort: 1,
			}},
		}},
	}
	task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
	if err != nil {
		t.Fatal(err)
	}
	mapping, err = wildcardHostPortMapping(task, offer)
	if err, _ := err.(*PortAllocationError); err == nil {
		t.Fatal("Expected port allocation error")
	} else if len(err.Ports) != 0 {
		t.Fatal("Expected port allocation error for wildcard port")
	}

	//--
	offer = &mesos.Offer{
		Resources: []*mesos.Resource{
			rangeResource("ports", []uint64{1, 2}),
		},
	}
	mapping, err = wildcardHostPortMapping(task, offer)
	if err != nil {
		t.Fatal(err)
	} else if len(mapping) != 2 {
		t.Fatal("Expected both ports allocated")
	}
	valid := 0
	for _, entry := range mapping {
		if entry.ContainerIdx == 0 && entry.PortIdx == 0 && entry.OfferPort == 2 {
			valid++
		}
		if entry.ContainerIdx == 0 && entry.PortIdx == 1 && entry.OfferPort == 1 {
			valid++
		}
	}
	if valid < 2 {
		t.Fatalf("Expected 2 valid port mappings, not %d", valid)
	}
}

func TestMappingTypeForPod(t *testing.T) {
	pod := &api.Pod{
		ObjectMeta: api.ObjectMeta{
			Labels: map[string]string{},
		},
	}
	mt := MappingTypeForPod(pod)
	if mt != HostPortMappingWildcard {
		t.Fatalf("expected wildcard mapping")
	}

	pod.Labels[PortMappingLabelKey] = string(HostPortMappingFixed)
	mt = MappingTypeForPod(pod)
	if mt != HostPortMappingFixed {
		t.Fatalf("expected fixed mapping")
	}

	pod.Labels[PortMappingLabelKey] = string(HostPortMappingWildcard)
	mt = MappingTypeForPod(pod)
	if mt != HostPortMappingWildcard {
		t.Fatalf("expected wildcard mapping")
	}
}
57
contrib/mesos/pkg/scheduler/podtask/protobuf.go
Normal file
@@ -0,0 +1,57 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
	"github.com/gogo/protobuf/proto"
	mesos "github.com/mesos/mesos-go/mesosproto"
)

// create a range resource for the listed ports
func rangeResource(name string, ports []uint64) *mesos.Resource {
	if len(ports) == 0 {
		// pod may consist of a container that doesn't expose any ports on the host
		return nil
	}
	return &mesos.Resource{
		Name:   proto.String(name),
		Type:   mesos.Value_RANGES.Enum(),
		Ranges: newRanges(ports),
	}
}

// generate port ranges from a list of ports. this implementation is very naive
func newRanges(ports []uint64) *mesos.Value_Ranges {
	r := make([]*mesos.Value_Range, 0)
	for _, port := range ports {
		x := proto.Uint64(port)
		r = append(r, &mesos.Value_Range{Begin: x, End: x})
	}
	return &mesos.Value_Ranges{Range: r}
}

func foreachRange(offer *mesos.Offer, resourceName string, f func(begin, end uint64)) {
	for _, resource := range offer.Resources {
		if resource.GetName() == resourceName {
			for _, r := range resource.GetRanges().Range {
				bp := r.GetBegin()
				ep := r.GetEnd()
				f(bp, ep)
			}
		}
	}
}
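A small sketch of how these helpers round-trip (ports chosen arbitrarily): rangeResource emits one single-port range per listed port, and foreachRange walks them back out of an offer.

func examplePortRanges() int {
	offer := &mesos.Offer{
		Resources: []*mesos.Resource{rangeResource("ports", []uint64{31000, 31001})},
	}
	total := 0
	foreachRange(offer, "ports", func(bp, ep uint64) {
		total += int(ep-bp) + 1 // each range contributes ep-bp+1 ports
	})
	return total // 2 for the offer above
}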
335
contrib/mesos/pkg/scheduler/podtask/registry.go
Normal file
@@ -0,0 +1,335 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
	"container/ring"
	"encoding/json"
	"fmt"
	"sync"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	log "github.com/golang/glog"
	mesos "github.com/mesos/mesos-go/mesosproto"
)

const (
	//TODO(jdef) move this somewhere else
	PodPath = "/pods"

	// length of historical record of finished tasks
	defaultFinishedTasksSize = 1024
)

// state store for pod tasks
type Registry interface {
	// register the specified task with this registry, as long as the current error
	// condition is nil. if no errors occur then return a copy of the registered task.
	Register(*T, error) (*T, error)

	// unregister the specified task from this registry
	Unregister(*T)

	// update state for the registered task identified by task.ID; an error is
	// returned if the task is unknown or may not be updated in its current state.
	Update(task *T) error

	// return the task registered for the specified task ID and its current state.
	// if there is no such task then StateUnknown is returned.
	Get(taskId string) (task *T, currentState StateType)

	// return the non-terminal task corresponding to the specified pod ID
	ForPod(podID string) (task *T, currentState StateType)

	// update the task status given the specified mesos task status update, returning a
	// copy of the updated task (if any) and its state.
	UpdateStatus(status *mesos.TaskStatus) (*T, StateType)

	// return a list of tasks that match the given filter, or all tasks if filter == nil.
	List(filter func(*T) bool) []*T
}

type inMemoryRegistry struct {
	rw            sync.RWMutex
	taskRegistry  map[string]*T
	tasksFinished *ring.Ring
	podToTask     map[string]string
}

func NewInMemoryRegistry() Registry {
	return &inMemoryRegistry{
		taskRegistry:  make(map[string]*T),
		tasksFinished: ring.New(defaultFinishedTasksSize),
		podToTask:     make(map[string]string),
	}
}

func (k *inMemoryRegistry) List(accepts func(t *T) bool) (tasks []*T) {
	k.rw.RLock()
	defer k.rw.RUnlock()
	for _, task := range k.taskRegistry {
		if accepts == nil || accepts(task) {
			tasks = append(tasks, task.Clone())
		}
	}
	return
}

func (k *inMemoryRegistry) ForPod(podID string) (task *T, currentState StateType) {
	k.rw.RLock()
	defer k.rw.RUnlock()
	tid, ok := k.podToTask[podID]
	if !ok {
		return nil, StateUnknown
	}
	t, state := k._get(tid)
	return t.Clone(), state
}

// registers a pod task, unless the given error is non-nil; the error, if any,
// is passed through to the caller either way.
func (k *inMemoryRegistry) Register(task *T, err error) (*T, error) {
	if err == nil {
		k.rw.Lock()
		defer k.rw.Unlock()
		if _, found := k.podToTask[task.podKey]; found {
			return nil, fmt.Errorf("task already registered for pod key %q", task.podKey)
		}
		if _, found := k.taskRegistry[task.ID]; found {
			return nil, fmt.Errorf("task already registered for id %q", task.ID)
		}
		k.podToTask[task.podKey] = task.ID
		k.taskRegistry[task.ID] = task
	}
	return task.Clone(), err
}
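For orientation, a minimal sketch of the intended Register/Get/Unregister round trip; callers always receive defensive clones. The task value is assumed to come from podtask.New elsewhere:

func trackTask(r Registry, task *T) error {
	clone, err := r.Register(task, nil) // a copy of the registered task comes back
	if err != nil {
		return err
	}
	if got, state := r.Get(clone.ID); got == nil || state != StatePending {
		return fmt.Errorf("expected pending task %q", clone.ID)
	}
	r.Unregister(clone)
	return nil
}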
// updates internal task state. updates are limited to Spec, Flags, and Offer for
// StatePending tasks, and are limited to Flag updates (additive only) for StateRunning tasks.
func (k *inMemoryRegistry) Update(task *T) error {
	if task == nil {
		return nil
	}
	k.rw.Lock()
	defer k.rw.Unlock()
	switch internal, state := k._get(task.ID); state {
	case StateUnknown:
		return fmt.Errorf("no such task: %v", task.ID)
	case StatePending:
		internal.Offer = task.Offer
		internal.Spec = task.Spec
		(&task.Spec).copyTo(&internal.Spec)
		internal.Flags = map[FlagType]struct{}{}
		fallthrough
	case StateRunning:
		for k, v := range task.Flags {
			internal.Flags[k] = v
		}
		return nil
	default:
		return fmt.Errorf("may not update task %v in state %v", task.ID, state)
	}
}

func (k *inMemoryRegistry) Unregister(task *T) {
	k.rw.Lock()
	defer k.rw.Unlock()
	delete(k.podToTask, task.podKey)
	delete(k.taskRegistry, task.ID)
}

func (k *inMemoryRegistry) Get(taskId string) (*T, StateType) {
	k.rw.RLock()
	defer k.rw.RUnlock()
	t, state := k._get(taskId)
	return t.Clone(), state
}

// assumes that the caller has already locked around access to task state.
// the caller is also responsible for cloning the task object before it leaves
// the context of this registry.
func (k *inMemoryRegistry) _get(taskId string) (*T, StateType) {
	if task, found := k.taskRegistry[taskId]; found {
		return task, task.State
	}
	return nil, StateUnknown
}

func (k *inMemoryRegistry) UpdateStatus(status *mesos.TaskStatus) (*T, StateType) {
	taskId := status.GetTaskId().GetValue()

	k.rw.Lock()
	defer k.rw.Unlock()
	task, state := k._get(taskId)

	switch status.GetState() {
	case mesos.TaskState_TASK_STAGING:
		k.handleTaskStaging(task, state, status)
	case mesos.TaskState_TASK_STARTING:
		k.handleTaskStarting(task, state, status)
	case mesos.TaskState_TASK_RUNNING:
		k.handleTaskRunning(task, state, status)
	case mesos.TaskState_TASK_FINISHED:
		k.handleTaskFinished(task, state, status)
	case mesos.TaskState_TASK_FAILED:
		k.handleTaskFailed(task, state, status)
	case mesos.TaskState_TASK_KILLED:
		k.handleTaskKilled(task, state, status)
	case mesos.TaskState_TASK_LOST:
		k.handleTaskLost(task, state, status)
	default:
		log.Warningf("unhandled status update for task: %v", taskId)
	}
	return task.Clone(), state
}

func (k *inMemoryRegistry) handleTaskStaging(task *T, state StateType, status *mesos.TaskStatus) {
	if status.GetSource() != mesos.TaskStatus_SOURCE_MASTER {
		log.Errorf("received STAGING for task %v with unexpected source: %v",
			status.GetTaskId().GetValue(), status.GetSource())
	}
}

func (k *inMemoryRegistry) handleTaskStarting(task *T, state StateType, status *mesos.TaskStatus) {
	// we expect to receive this when a launched task is finally "bound"
	// via the API server. however, there's nothing specific for us to do here.
	switch state {
	case StatePending:
		task.UpdatedTime = time.Now()
		if !task.Has(Bound) {
			task.Set(Bound)
			task.bindTime = task.UpdatedTime
			timeToBind := task.bindTime.Sub(task.launchTime)
			metrics.BindLatency.Observe(metrics.InMicroseconds(timeToBind))
		}
	default:
		taskId := status.GetTaskId().GetValue()
		log.Warningf("Ignore status TASK_STARTING because the task %v is not pending", taskId)
	}
}

func (k *inMemoryRegistry) handleTaskRunning(task *T, state StateType, status *mesos.TaskStatus) {
	taskId := status.GetTaskId().GetValue()
	switch state {
	case StatePending:
		task.UpdatedTime = time.Now()
		log.Infof("Received running status for pending task: %v", taskId)
		fillRunningPodInfo(task, status)
		task.State = StateRunning
	case StateRunning:
		task.UpdatedTime = time.Now()
		log.V(2).Infof("Ignore status TASK_RUNNING because the task %v is already running", taskId)
	case StateFinished:
		log.Warningf("Ignore status TASK_RUNNING because the task %v is already finished", taskId)
	default:
		log.Warningf("Ignore status TASK_RUNNING because the task %v is discarded", taskId)
	}
}

func ParsePodStatusResult(taskStatus *mesos.TaskStatus) (result api.PodStatusResult, err error) {
	if taskStatus.Data != nil {
		err = json.Unmarshal(taskStatus.Data, &result)
	} else {
		err = fmt.Errorf("missing TaskStatus.Data")
	}
	return
}

func fillRunningPodInfo(task *T, taskStatus *mesos.TaskStatus) {
	if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
		// there is no data..
		return
	}
	//TODO(jdef) determine the usefulness of this information (if any)
	if result, err := ParsePodStatusResult(taskStatus); err != nil {
		log.Errorf("invalid TaskStatus.Data for task '%v': %v", task.ID, err)
	} else {
		task.podStatus = result.Status
		log.Infof("received pod status for task %v: %+v", task.ID, result.Status)
	}
}

func (k *inMemoryRegistry) handleTaskFinished(task *T, state StateType, status *mesos.TaskStatus) {
	taskId := status.GetTaskId().GetValue()
	switch state {
	case StatePending:
		panic(fmt.Sprintf("Pending task %v finished, this shouldn't happen", taskId))
	case StateRunning:
		log.V(2).Infof("received finished status for running task: %v", taskId)
		delete(k.podToTask, task.podKey)
		task.State = StateFinished
		task.UpdatedTime = time.Now()
		k.tasksFinished = k.recordFinishedTask(task.ID)
	case StateFinished:
		log.Warningf("Ignore status TASK_FINISHED because the task %v is already finished", taskId)
	default:
		log.Warningf("Ignore status TASK_FINISHED because the task %v is not running", taskId)
	}
}

// record that a task has finished.
// older records are expunged one at a time once the historical ring buffer is saturated.
// assumes the caller is holding the state lock.
func (k *inMemoryRegistry) recordFinishedTask(taskId string) *ring.Ring {
	slot := k.tasksFinished.Next()
	if slot.Value != nil {
		// garbage collect the older finished task from the registry
		gctaskId := slot.Value.(string)
		if gctask, found := k.taskRegistry[gctaskId]; found && gctask.State == StateFinished {
			delete(k.taskRegistry, gctaskId)
		}
	}
	slot.Value = taskId
	return slot
}
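recordFinishedTask leans on container/ring semantics: once the buffer has wrapped, Next() lands on the oldest slot, so each insert past capacity evicts exactly one historical entry. A standalone sketch with a capacity of 2:

func exampleRingEviction() {
	r := ring.New(2)
	for _, id := range []string{"t1", "t2", "t3"} {
		r = r.Next()
		r.Value = id // the third insert overwrites "t1", the oldest entry
	}
}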
func (k *inMemoryRegistry) handleTaskFailed(task *T, state StateType, status *mesos.TaskStatus) {
	switch state {
	case StatePending, StateRunning:
		delete(k.taskRegistry, task.ID)
		delete(k.podToTask, task.podKey)
	}
}

func (k *inMemoryRegistry) handleTaskKilled(task *T, state StateType, status *mesos.TaskStatus) {
	defer func() {
		msg := fmt.Sprintf("task killed: %+v, task %+v", status, task)
		if task != nil && task.Has(Deleted) {
			// we were expecting this, nothing out of the ordinary
			log.V(2).Infoln(msg)
		} else {
			log.Errorln(msg)
		}
	}()
	switch state {
	case StatePending, StateRunning:
		delete(k.taskRegistry, task.ID)
		delete(k.podToTask, task.podKey)
	}
}

func (k *inMemoryRegistry) handleTaskLost(task *T, state StateType, status *mesos.TaskStatus) {
	switch state {
	case StateRunning, StatePending:
		delete(k.taskRegistry, task.ID)
		delete(k.podToTask, task.podKey)
	}
}
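Taken together, the handlers above implement a simple terminal-state policy: FAILED, KILLED, and LOST tasks are removed from the registry immediately, while FINISHED tasks linger in the history ring until expunged. A compact restatement (sketch only, not part of this commit):

func isDroppedOutright(s mesos.TaskState) bool {
	switch s {
	case mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_KILLED, mesos.TaskState_TASK_LOST:
		return true // deleted from the registry right away
	}
	return false // FINISHED tasks are retained in the finished-task history
}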
320
contrib/mesos/pkg/scheduler/podtask/registry_test.go
Normal file
@@ -0,0 +1,320 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podtask

import (
	"testing"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
	mesos "github.com/mesos/mesos-go/mesosproto"
	"github.com/mesos/mesos-go/mesosutil"
	"github.com/stretchr/testify/assert"
)

func TestInMemoryRegistry_RegisterGetUnregister(t *testing.T) {
	assert := assert.New(t)

	registry := NewInMemoryRegistry()

	// it's empty at the beginning
	tasks := registry.List(func(t *T) bool { return true })
	assert.Empty(tasks)

	// add a task
	a, _ := fakePodTask("a")
	a_clone, err := registry.Register(a, nil)
	assert.NoError(err)
	assert.Equal(a_clone.ID, a.ID)
	assert.Equal(a_clone.podKey, a.podKey)

	// add another task
	b, _ := fakePodTask("b")
	b_clone, err := registry.Register(b, nil)
	assert.NoError(err)
	assert.Equal(b_clone.ID, b.ID)
	assert.Equal(b_clone.podKey, b.podKey)

	// find tasks in the registry
	tasks = registry.List(func(t *T) bool { return true })
	assert.Len(tasks, 2)
	assert.Contains(tasks, a_clone)
	assert.Contains(tasks, b_clone)

	tasks = registry.List(func(t *T) bool { return t.ID == a.ID })
	assert.Len(tasks, 1)
	assert.Contains(tasks, a_clone)

	task, _ := registry.ForPod(a.podKey)
	assert.NotNil(task)
	assert.Equal(task.ID, a.ID)

	task, _ = registry.ForPod(b.podKey)
	assert.NotNil(task)
	assert.Equal(task.ID, b.ID)

	task, _ = registry.ForPod("no-pod-key")
	assert.Nil(task)

	task, _ = registry.Get(a.ID)
	assert.NotNil(task)
	assert.Equal(task.ID, a.ID)

	task, _ = registry.Get("unknown-task-id")
	assert.Nil(task)

	// re-add a task
	a_clone, err = registry.Register(a, nil)
	assert.Error(err)
	assert.Nil(a_clone)

	// re-add a task with another podKey, but the same task id
	another_a := a.Clone()
	another_a.podKey = "another-pod"
	another_a_clone, err := registry.Register(another_a, nil)
	assert.Error(err)
	assert.Nil(another_a_clone)

	// re-add a task with another task ID, but the same podKey
	another_b := b.Clone()
	another_b.ID = "another-task-id"
	another_b_clone, err := registry.Register(another_b, nil)
	assert.Error(err)
	assert.Nil(another_b_clone)

	// unregister a task
	registry.Unregister(b)

	tasks = registry.List(func(t *T) bool { return true })
	assert.Len(tasks, 1)
	assert.Contains(tasks, a)

	// unregister a task that was never registered
	unregistered_task, _ := fakePodTask("unregistered-task")
	registry.Unregister(unregistered_task)
}

func fakeStatusUpdate(taskId string, state mesos.TaskState) *mesos.TaskStatus {
	status := mesosutil.NewTaskStatus(mesosutil.NewTaskID(taskId), state)
	status.Data = []byte("{}") // empty json
	masterSource := mesos.TaskStatus_SOURCE_MASTER
	status.Source = &masterSource
	return status
}

func TestInMemoryRegistry_State(t *testing.T) {
	assert := assert.New(t)

	registry := NewInMemoryRegistry()

	// add a task
	a, _ := fakePodTask("a")
	a_clone, err := registry.Register(a, nil)
	assert.NoError(err)
	assert.Equal(a.State, a_clone.State)

	// update the status
	assert.Equal(a_clone.State, StatePending)
	a_clone, state := registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_RUNNING))
	assert.Equal(state, StatePending)         // old state
	assert.Equal(a_clone.State, StateRunning) // new state

	// update an unknown task
	unknown_clone, state := registry.UpdateStatus(fakeStatusUpdate("unknown-task-id", mesos.TaskState_TASK_RUNNING))
	assert.Nil(unknown_clone)
	assert.Equal(state, StateUnknown)
}

func TestInMemoryRegistry_Update(t *testing.T) {
	assert := assert.New(t)

	// create offers registry
	ttl := time.Second / 4
	config := offers.RegistryConfig{
		DeclineOffer: func(offerId string) <-chan error {
			return proc.ErrorChan(nil)
		},
		Compat: func(o *mesos.Offer) bool {
			return true
		},
		TTL:       ttl,
		LingerTTL: 2 * ttl,
	}
	storage := offers.CreateRegistry(config)

	// add an offer
	offerId := mesosutil.NewOfferID("foo")
	mesosOffer := &mesos.Offer{Id: offerId}
	storage.Add([]*mesos.Offer{mesosOffer})
	offer, ok := storage.Get(offerId.GetValue())
	assert.True(ok)

	// create registry
	registry := NewInMemoryRegistry()
	a, _ := fakePodTask("a")
	registry.Register(a.Clone(), nil) // clone a here because we change it below

	// state changes are ignored
	a.State = StateRunning
	err := registry.Update(a)
	assert.NoError(err)
	a_clone, _ := registry.Get(a.ID)
	assert.Equal(StatePending, a_clone.State)

	// offer is updated while pending
	a.Offer = offer
	err = registry.Update(a)
	assert.NoError(err)
	a_clone, _ = registry.Get(a.ID)
	assert.Equal(offer.Id(), a_clone.Offer.Id())

	// spec is updated while pending
	a.Spec = Spec{SlaveID: "slave-1"}
	err = registry.Update(a)
	assert.NoError(err)
	a_clone, _ = registry.Get(a.ID)
	assert.Equal("slave-1", a_clone.Spec.SlaveID)

	// flags are updated while pending
	a.Flags[Launched] = struct{}{}
	err = registry.Update(a)
	assert.NoError(err)
	a_clone, _ = registry.Get(a.ID)

	_, found_launched := a_clone.Flags[Launched]
	assert.True(found_launched)

	// flags are updated while running
	registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_RUNNING))
	a.Flags[Bound] = struct{}{}
	err = registry.Update(a)
	assert.NoError(err)
	a_clone, _ = registry.Get(a.ID)

	_, found_launched = a_clone.Flags[Launched]
	assert.True(found_launched)
	_, found_bound := a_clone.Flags[Bound]
	assert.True(found_bound)

	// spec is ignored while running
	a.Spec = Spec{SlaveID: "slave-2"}
	err = registry.Update(a)
	assert.NoError(err)
	a_clone, _ = registry.Get(a.ID)
	assert.Equal("slave-1", a_clone.Spec.SlaveID)

	// error when finished
	registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_FINISHED))
	err = registry.Update(a)
	assert.Error(err)

	// update an unknown task
	unknown_task, _ := fakePodTask("unknown-task")
	err = registry.Update(unknown_task)
	assert.Error(err)

	// update a nil task
	err = registry.Update(nil)
	assert.Nil(err)
}

type transition struct {
	statusUpdate  mesos.TaskState
	expectedState *StateType
	expectPanic   bool
}

func NewTransition(statusUpdate mesos.TaskState, expectedState StateType) transition {
	return transition{statusUpdate: statusUpdate, expectedState: &expectedState, expectPanic: false}
}

func NewTransitionToDeletedTask(statusUpdate mesos.TaskState) transition {
	return transition{statusUpdate: statusUpdate, expectedState: nil, expectPanic: false}
}

func NewTransitionWhichPanics(statusUpdate mesos.TaskState) transition {
	return transition{statusUpdate: statusUpdate, expectPanic: true}
}

func testStateTrace(t *testing.T, transitions []transition) *Registry {
	assert := assert.New(t)

	registry := NewInMemoryRegistry()
	a, _ := fakePodTask("a")
	a, _ = registry.Register(a, nil)

	// initial pending state
	assert.Equal(a.State, StatePending)

	for _, transition := range transitions {
		if transition.expectPanic {
			assert.Panics(func() {
				registry.UpdateStatus(fakeStatusUpdate(a.ID, transition.statusUpdate))
			})
		} else {
			a, _ = registry.UpdateStatus(fakeStatusUpdate(a.ID, transition.statusUpdate))
			if transition.expectedState == nil {
				a, _ = registry.Get(a.ID)
				assert.Nil(a, "expected task to be deleted from registry after status update to %v", transition.statusUpdate)
			} else {
				assert.Equal(a.State, *transition.expectedState)
			}
		}
	}

	return &registry
}

func TestInMemoryRegistry_TaskLifeCycle(t *testing.T) {
	testStateTrace(t, []transition{
		NewTransition(mesos.TaskState_TASK_STAGING, StatePending),
		NewTransition(mesos.TaskState_TASK_STARTING, StatePending),
		NewTransitionWhichPanics(mesos.TaskState_TASK_FINISHED),
		NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning),
		NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning),
		NewTransition(mesos.TaskState_TASK_STARTING, StateRunning),
		NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished),
		NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished),
		NewTransition(mesos.TaskState_TASK_RUNNING, StateFinished),
	})
}

func TestInMemoryRegistry_NotFinished(t *testing.T) {
	// all of these behave the same
	notFinishedStates := []mesos.TaskState{
		mesos.TaskState_TASK_FAILED,
		mesos.TaskState_TASK_KILLED,
		mesos.TaskState_TASK_LOST,
	}
	for _, notFinishedState := range notFinishedStates {
		testStateTrace(t, []transition{
			NewTransitionToDeletedTask(notFinishedState),
		})

		testStateTrace(t, []transition{
			NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning),
			NewTransitionToDeletedTask(notFinishedState),
		})

		testStateTrace(t, []transition{
			NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning),
			NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished),
			NewTransition(notFinishedState, StateFinished),
		})
	}
}
924
contrib/mesos/pkg/scheduler/scheduler.go
Normal file
@@ -0,0 +1,924 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"fmt"
	"io"
	"math"
	"net/http"
	"reflect"
	"sync"
	"time"

	execcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
	offerMetrics "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers/metrics"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
	schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/uid"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
	log "github.com/golang/glog"
	mesos "github.com/mesos/mesos-go/mesosproto"
	mutil "github.com/mesos/mesos-go/mesosutil"
	bindings "github.com/mesos/mesos-go/scheduler"
)

type Slave struct {
	HostName string
}

func newSlave(hostName string) *Slave {
	return &Slave{
		HostName: hostName,
	}
}

type slaveStorage struct {
	sync.Mutex
	slaves map[string]*Slave // SlaveID => slave.
}

func newSlaveStorage() *slaveStorage {
	return &slaveStorage{
		slaves: make(map[string]*Slave),
	}
}

// Create a mapping between a slaveID and slave if one does not already exist.
func (self *slaveStorage) checkAndAdd(slaveId, slaveHostname string) {
	self.Lock()
	defer self.Unlock()
	_, exists := self.slaves[slaveId]
	if !exists {
		self.slaves[slaveId] = newSlave(slaveHostname)
	}
}

func (self *slaveStorage) getSlaveIds() []string {
	self.Lock()
	defer self.Unlock()
	slaveIds := make([]string, 0, len(self.slaves))
	for slaveID := range self.slaves {
		slaveIds = append(slaveIds, slaveID)
	}
	return slaveIds
}

func (self *slaveStorage) getSlave(slaveId string) (*Slave, bool) {
	self.Lock()
	defer self.Unlock()
	slave, exists := self.slaves[slaveId]
	return slave, exists
}
type PluginInterface interface {
	// the apiserver may have a different state for the pod than we do,
	// so reconcile our records, but only for this one pod
	reconcilePod(api.Pod)

	// execute the scheduling plugin; should start a goroutine and return immediately
	Run(<-chan struct{})
}

// KubernetesScheduler implements:
// 1: A mesos scheduler.
// 2: A kubernetes scheduler plugin.
// 3: A kubernetes pod.Registry.
type KubernetesScheduler struct {
	// We use a lock here to avoid races
	// between invoking the mesos callback
	// and invoking the pod registry interfaces.
	// In particular, changes to podtask.T objects are currently guarded by this lock.
	*sync.RWMutex

	// Config related, write-once

	schedcfg          *schedcfg.Config
	executor          *mesos.ExecutorInfo
	executorGroup     uint64
	scheduleFunc      PodScheduleFunc
	client            *client.Client
	etcdClient        tools.EtcdGetSet
	failoverTimeout   float64 // in seconds
	reconcileInterval int64

	// Mesos context.

	driver         bindings.SchedulerDriver // late initialization
	frameworkId    *mesos.FrameworkID
	masterInfo     *mesos.MasterInfo
	registered     bool
	registration   chan struct{} // signal chan that closes upon first successful registration
	onRegistration sync.Once
	offers         offers.Registry
	slaves         *slaveStorage

	// unsafe state, needs to be guarded

	taskRegistry podtask.Registry

	// via deferred init

	plugin             PluginInterface
	reconciler         *Reconciler
	reconcileCooldown  time.Duration
	asRegisteredMaster proc.Doer
	terminate          <-chan struct{} // signal chan, closes when we should kill background tasks
}

type Config struct {
	Schedcfg          schedcfg.Config
	Executor          *mesos.ExecutorInfo
	ScheduleFunc      PodScheduleFunc
	Client            *client.Client
	EtcdClient        tools.EtcdGetSet
	FailoverTimeout   float64
	ReconcileInterval int64
	ReconcileCooldown time.Duration
}

// New creates a new KubernetesScheduler
func New(config Config) *KubernetesScheduler {
	var k *KubernetesScheduler
	k = &KubernetesScheduler{
		schedcfg:          &config.Schedcfg,
		RWMutex:           new(sync.RWMutex),
		executor:          config.Executor,
		executorGroup:     uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
		scheduleFunc:      config.ScheduleFunc,
		client:            config.Client,
		etcdClient:        config.EtcdClient,
		failoverTimeout:   config.FailoverTimeout,
		reconcileInterval: config.ReconcileInterval,
		offers: offers.CreateRegistry(offers.RegistryConfig{
			Compat: func(o *mesos.Offer) bool {
				// filter the offers: the executor IDs must not identify a kubelet-
				// executor with a group that doesn't match ours
				for _, eid := range o.GetExecutorIds() {
					execuid := uid.Parse(eid.GetValue())
					if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
						return false
					}
				}
				return true
			},
			DeclineOffer: func(id string) <-chan error {
				errOnce := proc.NewErrorOnce(k.terminate)
				errOuter := k.asRegisteredMaster.Do(func() {
					var err error
					defer errOnce.Report(err)
					offerId := mutil.NewOfferID(id)
					filters := &mesos.Filters{}
					_, err = k.driver.DeclineOffer(offerId, filters)
				})
				return errOnce.Send(errOuter).Err()
			},
			// remember expired offers so that we can tell if a previous scheduling decision relied on one
			LingerTTL:     config.Schedcfg.OfferLingerTTL.Duration,
			TTL:           config.Schedcfg.OfferTTL.Duration,
			ListenerDelay: config.Schedcfg.ListenerDelay.Duration,
		}),
		slaves:            newSlaveStorage(),
		taskRegistry:      podtask.NewInMemoryRegistry(),
		reconcileCooldown: config.ReconcileCooldown,
		registration:      make(chan struct{}),
		asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
			return proc.ErrorChanf("cannot execute action with unregistered scheduler")
		}),
	}
	return k
}
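A condensed sketch of constructing the scheduler from this Config; every value below is a placeholder, and FCFSScheduleFunc is assumed to be this package's first-come-first-served PodScheduleFunc:

func newExampleScheduler(executorInfo *mesos.ExecutorInfo) *KubernetesScheduler {
	return New(Config{
		Schedcfg:          schedcfg.Config{}, // real deployments populate this from flags/config
		Executor:          executorInfo,      // must carry a uid-parseable ExecutorId
		ScheduleFunc:      FCFSScheduleFunc,  // assumed FCFS scheduling function in this package
		FailoverTimeout:   60,                // seconds; placeholder
		ReconcileInterval: 300,               // seconds; placeholder
		ReconcileCooldown: 15 * time.Second,  // placeholder
	})
}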
func (k *KubernetesScheduler) Init(electedMaster proc.Process, pl PluginInterface, mux *http.ServeMux) error {
	log.V(1).Infoln("initializing kubernetes mesos scheduler")

	k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
		if !k.registered {
			return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
		}
		return electedMaster.Do(a)
	})
	k.terminate = electedMaster.Done()
	k.plugin = pl
	k.offers.Init(k.terminate)
	k.InstallDebugHandlers(mux)
	return k.recoverTasks()
}

func (k *KubernetesScheduler) asMaster() proc.Doer {
	k.RLock()
	defer k.RUnlock()
	return k.asRegisteredMaster
}

func (k *KubernetesScheduler) InstallDebugHandlers(mux *http.ServeMux) {
	wrappedHandler := func(uri string, h http.Handler) {
		mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
			ch := make(chan struct{})
			closer := runtime.Closer(ch)
			proc.OnError(k.asMaster().Do(func() {
				defer closer()
				h.ServeHTTP(w, r)
			}), func(err error) {
				defer closer()
				log.Warningf("failed HTTP request for %s: %v", uri, err)
				w.WriteHeader(http.StatusServiceUnavailable)
			}, k.terminate)
			select {
			case <-time.After(k.schedcfg.HttpHandlerTimeout.Duration):
				log.Warningf("timed out waiting for request to be processed")
				w.WriteHeader(http.StatusServiceUnavailable)
				return
			case <-ch: // noop
			}
		})
	}
	requestReconciliation := func(uri string, requestAction func()) {
		wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			requestAction()
			w.WriteHeader(http.StatusNoContent)
		}))
	}
	requestReconciliation("/debug/actions/requestExplicit", k.reconciler.RequestExplicit)
	requestReconciliation("/debug/actions/requestImplicit", k.reconciler.RequestImplicit)

	wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		slaves := k.slaves.getSlaveIds()
		for _, slaveId := range slaves {
			_, err := k.driver.SendFrameworkMessage(
				k.executor.ExecutorId,
				mutil.NewSlaveID(slaveId),
				messages.Kamikaze)
			if err != nil {
				log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
			} else {
				io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
			}
		}
		io.WriteString(w, "OK")
	}))
}
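The wrapped handlers above refuse requests while the scheduler is disconnected (503) and answer reconciliation requests with 204. A quick client-side sketch; the scheduler address is a placeholder:

func pokeReconciliation() {
	resp, err := http.Get("http://127.0.0.1:10251/debug/actions/requestExplicit") // placeholder address
	if err != nil {
		return
	}
	defer resp.Body.Close()
	// expect 204 No Content while registered; 503 if the scheduler is disconnected
}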
func (k *KubernetesScheduler) Registration() <-chan struct{} {
|
||||
return k.registration
|
||||
}

// Registered is called when the scheduler registered with the master successfully.
func (k *KubernetesScheduler) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
	log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)

	k.driver = drv
	k.frameworkId = fid
	k.masterInfo = mi
	k.registered = true

	k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
	k.reconciler.RequestExplicit()
}

func (k *KubernetesScheduler) storeFrameworkId() {
	// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
	_, err := k.etcdClient.Set(meta.FrameworkIDKey, k.frameworkId.GetValue(), uint64(k.failoverTimeout))
	if err != nil {
		log.Errorf("failed to renew frameworkId TTL: %v", err)
	}
}

// Reregistered is called when the scheduler re-registered with the master successfully.
// This happens when the master fails over.
func (k *KubernetesScheduler) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
	log.Infof("Scheduler reregistered with the master: %v\n", mi)

	k.driver = drv
	k.masterInfo = mi
	k.registered = true

	k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
	k.reconciler.RequestExplicit()
}

// perform one-time initialization actions upon the first registration event received from Mesos.
func (k *KubernetesScheduler) onInitialRegistration(driver bindings.SchedulerDriver) {
	defer close(k.registration)

	if k.failoverTimeout > 0 {
		refreshInterval := k.schedcfg.FrameworkIdRefreshInterval.Duration
		if k.failoverTimeout < k.schedcfg.FrameworkIdRefreshInterval.Duration.Seconds() {
			refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
		}
		go runtime.Until(k.storeFrameworkId, refreshInterval, k.terminate)
	}

	r1 := k.makeTaskRegistryReconciler()
	r2 := k.makePodRegistryReconciler()

	k.reconciler = newReconciler(k.asRegisteredMaster, k.makeCompositeReconciler(r1, r2),
		k.reconcileCooldown, k.schedcfg.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
	go k.reconciler.Run(driver)

	if k.reconcileInterval > 0 {
		ri := time.Duration(k.reconcileInterval) * time.Second
		time.AfterFunc(k.schedcfg.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.reconciler.RequestImplicit, ri, k.terminate) })
		log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedcfg.InitialImplicitReconciliationDelay.Duration)
	}
}

// Disconnected is called when the scheduler loses connection to the master.
func (k *KubernetesScheduler) Disconnected(driver bindings.SchedulerDriver) {
	log.Infof("Master disconnected!\n")

	k.registered = false

	// discard all cached offers to avoid unnecessary TASK_LOST updates
	k.offers.Invalidate("")
}

// ResourceOffers is called when the scheduler receives some offers from the master.
func (k *KubernetesScheduler) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
	log.V(2).Infof("Received offers %+v", offers)

	// Record the offers in the global offer map as well as each slave's offer map.
	k.offers.Add(offers)
	for _, offer := range offers {
		slaveId := offer.GetSlaveId().GetValue()
		k.slaves.checkAndAdd(slaveId, offer.GetHostname())
	}
}

// OfferRescinded is called when the resources are rescinded from the scheduler.
func (k *KubernetesScheduler) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
	log.Infof("Offer rescinded %v\n", offerId)

	oid := offerId.GetValue()
	k.offers.Delete(oid, offerMetrics.OfferRescinded)
}

// StatusUpdate is called when a status update message is sent to the scheduler.
func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {

	source, reason := "none", "none"
	if taskStatus.Source != nil {
		source = (*taskStatus.Source).String()
	}
	if taskStatus.Reason != nil {
		reason = (*taskStatus.Reason).String()
	}
	taskState := taskStatus.GetState()
	metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()

	log.Infof(
		"task status update %q from %q for task %q on slave %q executor %q for reason %q",
		taskState.String(),
		source,
		taskStatus.TaskId.GetValue(),
		taskStatus.SlaveId.GetValue(),
		taskStatus.ExecutorId.GetValue(),
		reason)

	switch taskState {
	case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
		if _, state := k.taskRegistry.UpdateStatus(taskStatus); state == podtask.StateUnknown {
			if taskState != mesos.TaskState_TASK_FINISHED {
				//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
				//I don't want to reincarnate then.. TASK_LOST is a special case because
				//the master is stateless and there are scenarios where I may get TASK_LOST
				//followed by TASK_RUNNING.
				//TODO(jdef) consider running this asynchronously since there are API server
				//calls that may be made
				k.reconcileNonTerminalTask(driver, taskStatus)
			} // else, we don't really care about FINISHED tasks that aren't registered
			return
		}
		if _, exists := k.slaves.getSlave(taskStatus.GetSlaveId().GetValue()); !exists {
			// a registered task has an update reported by a slave that we don't recognize.
			// this should never happen! So we don't reconcile it.
			log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
			return
		}
	case mesos.TaskState_TASK_FAILED:
		if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
			if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
				go k.plugin.reconcilePod(task.Pod)
				return
			}
		} else {
			// unknown task failed, not much we can do about it
			return
		}
		// last-ditch effort to reconcile our records
		fallthrough
	case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
		k.reconcileTerminalTask(driver, taskStatus)
	}
}

func (k *KubernetesScheduler) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	task, state := k.taskRegistry.UpdateStatus(taskStatus)

	if (state == podtask.StateRunning || state == podtask.StatePending) && taskStatus.SlaveId != nil &&
		((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
			(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED)) {
		//--
		// pod-task has metadata that refers to:
		// (1) a task that Mesos no longer knows about, or else
		// (2) a pod that the Kubelet will never report as "failed"
		// For now, destroy the pod and hope that there's a replication controller backing it up.
		// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
		pod := &task.Pod
		log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
		if err := k.client.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil && !errors.IsNotFound(err) {
			log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
		}
	} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
		// attempt to prevent dangling pods in the pod and task registries
		log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
		k.reconciler.RequestExplicit()
	} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
		//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
		//If we're reconciling and receive this then the executor may be
		//running a task that we need it to kill. It's possible that the framework
		//is unrecognized by the master at this point, so KillTask is not guaranteed
		//to do anything. The underlying driver transport may be able to send a
		//FrameworkMessage directly to the slave to terminate the task.
		log.V(2).Infof("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
		data := fmt.Sprintf("task-lost:%s", task.ID) //TODO(jdef) use a real message type
		if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
			log.Error(err.Error())
		}
	}
}

// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	// attempt to recover task from pod info:
	// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
	// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
	// - pull the pod metadata down from the api server
	// - perform task recovery based on pod metadata
	taskId := taskStatus.TaskId.GetValue()
	if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
		// there will be no data in the task status that we can use to determine the associated pod
		switch taskStatus.GetState() {
		case mesos.TaskState_TASK_STAGING:
			// there is still hope for this task, don't kill it just yet
			//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
			return
		default:
			// for TASK_{STARTING,RUNNING} we should have already attempted recoverTasks().
			// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
			// be processing this reconciliation update before we process the one from the executor.
			// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
			// so it gets killed.
			log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
		}
	} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
	} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
			podStatus.Name, taskId, err)
	} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
		if t, ok, err := podtask.RecoverFrom(*pod); ok {
			log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
			_, err := k.taskRegistry.Register(t, nil)
			if err != nil {
				// someone beat us to it?!
				log.Warningf("failed to register recovered task: %v", err)
				return
			} else {
				k.taskRegistry.UpdateStatus(taskStatus)
			}
			return
		} else if err != nil {
			//should kill the pod and the task
			log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
			if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
				log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
			}
		} else {
			//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
			//metadata is not appropriate for task reconstruction -- which should almost certainly never
			//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
			//we were failed over.

			//kill this task, allow the newly launched scheduler to schedule the new pod
			log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
		}
	} else if errors.IsNotFound(err) {
		// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
		log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
	} else if errors.IsServerTimeout(err) {
		log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
		return
	} else {
		log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
		return
	}
	if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
		log.Errorf("failed to kill task %v: %v", taskId, err)
	}
}

// FrameworkMessage is called when the scheduler receives a message from the executor.
func (k *KubernetesScheduler) FrameworkMessage(driver bindings.SchedulerDriver,
	executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) {
	log.Infof("Received messages from executor %v of slave %v, %v\n", executorId, slaveId, message)
}

// SlaveLost is called when some slave is lost.
func (k *KubernetesScheduler) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) {
	log.Infof("Slave %v is lost\n", slaveId)

	sid := slaveId.GetValue()
	k.offers.InvalidateForSlave(sid)

	// TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile
	// tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically
	// flush lost slaves older than X, and for which no tasks or pods reference.

	// unfinished tasks/pods will be dropped. use a replication controller if you want pods to
	// be restarted when slaves die.
}

// ExecutorLost is called when some executor is lost.
func (k *KubernetesScheduler) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) {
	log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status)
	// TODO(yifan): Restart any unfinished tasks of the executor.
}

// Error is called when there is an unrecoverable error in the scheduler or scheduler driver.
// The driver should have been aborted before this is invoked.
func (k *KubernetesScheduler) Error(driver bindings.SchedulerDriver, message string) {
	log.Fatalf("fatal scheduler error: %v\n", message)
}

// filter func used for explicit task reconciliation, selects only non-terminal tasks which
// have been communicated to mesos (read: launched).
func explicitTaskFilter(t *podtask.T) bool {
	switch t.State {
	case podtask.StateRunning:
		return true
	case podtask.StatePending:
		return t.Has(podtask.Launched)
	default:
		return false
	}
}

// invoke the given ReconcilerAction funcs in sequence, aborting the sequence if reconciliation
// is cancelled. if any other errors occur the composite reconciler will attempt to complete the
// sequence, reporting only the last generated error.
func (k *KubernetesScheduler) makeCompositeReconciler(actions ...ReconcilerAction) ReconcilerAction {
	if x := len(actions); x == 0 {
		// programming error
		panic("no actions specified for composite reconciler")
	} else if x == 1 {
		return actions[0]
	}
	chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b ReconcilerAction) <-chan error {
		ech := a(d, c)
		ch := make(chan error, 1)
		go func() {
			select {
			case <-k.terminate:
			case <-c:
			case e := <-ech:
				if e != nil {
					ch <- e
					return
				}
				ech = b(d, c)
				select {
				case <-k.terminate:
				case <-c:
				case e := <-ech:
					if e != nil {
						ch <- e
						return
					}
					close(ch)
					return
				}
			}
			ch <- fmt.Errorf("aborting composite reconciler action")
		}()
		return ch
	}
	result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
		return chained(d, c, actions[0], actions[1])
	}
	for i := 2; i < len(actions); i++ {
		i := i
		prev := result // capture the current composite by value; closing over the mutable var would self-recurse
		next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
			return chained(d, c, ReconcilerAction(prev), actions[i])
		}
		result = next
	}
	return ReconcilerAction(result)
}
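
// Usage sketch (illustrative only): onInitialRegistration composes the two
// registry reconcilers this way; the composite runs them in sequence and
// reports only the last error unless the sequence is cancelled or terminated:
//
//	action := k.makeCompositeReconciler(
//		k.makeTaskRegistryReconciler(),
//		k.makePodRegistryReconciler(),
//	)
//	if err := <-action(driver, cancel); err != nil {
//		log.Errorf("composite reconciliation failed: %v", err)
//	}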

// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks listed in the scheduler's internal taskRegistry.
func (k *KubernetesScheduler) makeTaskRegistryReconciler() ReconcilerAction {
	return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
		taskToSlave := make(map[string]string)
		for _, t := range k.taskRegistry.List(explicitTaskFilter) {
			if t.Spec.SlaveID != "" {
				taskToSlave[t.ID] = t.Spec.SlaveID
			}
		}
		return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
	})
}

// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks identified by annotations in the Kubernetes pod registry.
func (k *KubernetesScheduler) makePodRegistryReconciler() ReconcilerAction {
	return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
		ctx := api.NewDefaultContext()
		podList, err := k.client.Pods(api.NamespaceValue(ctx)).List(labels.Everything(), fields.Everything())
		if err != nil {
			return proc.ErrorChanf("failed to reconcile pod registry: %v", err)
		}
		taskToSlave := make(map[string]string)
		for _, pod := range podList.Items {
			if len(pod.Annotations) == 0 {
				continue
			}
			taskId, found := pod.Annotations[meta.TaskIdKey]
			if !found {
				continue
			}
			slaveId, found := pod.Annotations[meta.SlaveIdKey]
			if !found {
				continue
			}
			taskToSlave[taskId] = slaveId
		}
		return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
	})
}

// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
func (k *KubernetesScheduler) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
	log.Info("explicit reconcile tasks")

	// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
	statusList := []*mesos.TaskStatus{}
	remaining := util.KeySet(reflect.ValueOf(taskToSlave))
	for taskId, slaveId := range taskToSlave {
		if slaveId == "" {
			delete(taskToSlave, taskId)
			continue
		}
		statusList = append(statusList, &mesos.TaskStatus{
			TaskId:  mutil.NewTaskID(taskId),
			SlaveId: mutil.NewSlaveID(slaveId),
			State:   mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
		})
	}

	select {
	case <-cancel:
		return reconciliationCancelledErr
	default:
		if _, err := driver.ReconcileTasks(statusList); err != nil {
			return err
		}
	}

	start := time.Now()
	first := true
	for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
		first = false
		// nothing to do here other than wait for status updates..
		if backoff > k.schedcfg.ExplicitReconciliationMaxBackoff.Duration {
			backoff = k.schedcfg.ExplicitReconciliationMaxBackoff.Duration
		}
		select {
		case <-cancel:
			return reconciliationCancelledErr
		case <-time.After(backoff):
			for taskId := range remaining {
				if task, _ := k.taskRegistry.Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
					// keep this task in remaining list
					continue
				}
				remaining.Delete(taskId)
			}
		}
	}
	return nil
}
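
// The wait loop above applies a capped exponential backoff between checks of
// the task registry. A minimal standalone sketch of the same pattern (done,
// cancel and maxBackoff are illustrative names, not part of this package):
//
//	backoff := 1 * time.Second
//	for !done() {
//		if backoff > maxBackoff {
//			backoff = maxBackoff
//		}
//		select {
//		case <-cancel:
//			return
//		case <-time.After(backoff):
//			// re-check state here
//		}
//		backoff *= 2
//	}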

var (
	reconciliationCancelledErr = fmt.Errorf("explicit task reconciliation cancelled")
)

type ReconcilerAction func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error

type Reconciler struct {
	proc.Doer
	Action                             ReconcilerAction
	explicit                           chan struct{}   // send an empty struct to trigger explicit reconciliation
	implicit                           chan struct{}   // send an empty struct to trigger implicit reconciliation
	done                               <-chan struct{} // close this when you want the reconciler to exit
	cooldown                           time.Duration
	explicitReconciliationAbortTimeout time.Duration
}

func newReconciler(doer proc.Doer, action ReconcilerAction,
	cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) *Reconciler {
	return &Reconciler{
		Doer:     doer,
		explicit: make(chan struct{}, 1),
		implicit: make(chan struct{}, 1),
		cooldown: cooldown,
		explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
		done: done,
		Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
			// trigger the reconciler action in the doer's execution context,
			// but it could take a while and the scheduler needs to be able to
			// process updates, the callbacks for which ALSO execute in the SAME
			// deferred execution context -- so the action MUST be executed async.
			errOnce := proc.NewErrorOnce(cancel)
			return errOnce.Send(doer.Do(func() {
				// only triggers the action if we're the currently elected,
				// registered master and runs the action async.
				go func() {
					var err <-chan error
					// use a closure so err is evaluated after the action runs,
					// not when the defer statement is registered
					defer func() { errOnce.Send(err) }()
					err = action(driver, cancel)
				}()
			})).Err()
		},
	}
}

func (r *Reconciler) RequestExplicit() {
	select {
	case r.explicit <- struct{}{}: // noop
	default: // request queue full; noop
	}
}

func (r *Reconciler) RequestImplicit() {
	select {
	case r.implicit <- struct{}{}: // noop
	default: // request queue full; noop
	}
}
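
// RequestExplicit and RequestImplicit use a common Go idiom: a buffered
// channel of capacity 1 combined with a non-blocking send coalesces any
// number of concurrent requests into at most one pending trigger. Sketch
// (illustrative only):
//
//	trigger := make(chan struct{}, 1)
//	request := func() {
//		select {
//		case trigger <- struct{}{}: // trigger queued
//		default: // one already pending; drop this request
//		}
//	}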

// execute task reconciliation, returns when r.done is closed. intended to run as a goroutine.
// if reconciliation is requested while another is in progress, the in-progress operation will be
// cancelled before the new reconciliation operation begins.
func (r *Reconciler) Run(driver bindings.SchedulerDriver) {
	var cancel, finished chan struct{}
requestLoop:
	for {
		select {
		case <-r.done:
			return
		default: // proceed
		}
		select {
		case <-r.implicit:
			metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
			select {
			case <-r.done:
				return
			case <-r.explicit:
				break // give preference to a pending request for explicit
			default: // continue
				// don't run implicit reconciliation while explicit is ongoing
				if finished != nil {
					select {
					case <-finished: // continue w/ implicit
					default:
						log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
						continue requestLoop
					}
				}
				errOnce := proc.NewErrorOnce(r.done)
				errCh := r.Do(func() {
					var err error
					// use a closure so err is evaluated after ReconcileTasks
					// returns, not when the defer statement is registered
					defer func() { errOnce.Report(err) }()
					log.Infoln("implicit reconcile tasks")
					metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
					if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
						log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
					}
				})
				proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
					log.Errorf("failed to run implicit reconciliation: %v", err)
				}, r.done)
				goto slowdown
			}
		case <-r.done:
			return
		case <-r.explicit: // continue
			metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
		}

		if cancel != nil {
			close(cancel)
			cancel = nil

			// play nice and wait for the prior operation to finish, complain
			// if it doesn't
			select {
			case <-r.done:
				return
			case <-finished: // noop, expected
			case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
				log.Error("reconciler action failed to stop upon cancellation")
			}
		}
		// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
		// if cancellation takes too long or fails - we don't want to close the same chan
		// more than once
		cancel = make(chan struct{})
		finished = make(chan struct{})
		go func(fin chan struct{}) {
			startedAt := time.Now()
			defer func() {
				metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
			}()

			metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
			defer close(fin)
			err := <-r.Action(driver, cancel)
			if err == reconciliationCancelledErr {
				metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
				log.Infoln(err.Error())
			} else if err != nil {
				log.Errorf("reconciler action failed: %v", err)
			}
		}(finished)
	slowdown:
		// don't allow reconciliation to run very frequently, either explicit or implicit
		select {
		case <-r.done:
			return
		case <-time.After(r.cooldown): // noop
		}
	} // for
}

func (ks *KubernetesScheduler) recoverTasks() error {
	ctx := api.NewDefaultContext()
	podList, err := ks.client.Pods(api.NamespaceValue(ctx)).List(labels.Everything(), fields.Everything())
	if err != nil {
		log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err)
		return err
	}
	recoverSlave := func(t *podtask.T) {
		slaveId := t.Spec.SlaveID
		ks.slaves.checkAndAdd(slaveId, t.Offer.Host())
	}
	for _, pod := range podList.Items {
		if t, ok, err := podtask.RecoverFrom(pod); err != nil {
			log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err)
			err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil)
			//TODO(jdef) check for temporary or not-found errors
			if err != nil {
				log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err)
			}
		} else if ok {
			ks.taskRegistry.Register(t, nil)
			recoverSlave(t)
			log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name)
		}
	}
	return nil
}
350
contrib/mesos/pkg/scheduler/scheduler_test.go
Normal file
@@ -0,0 +1,350 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"testing"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
	schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	mesos "github.com/mesos/mesos-go/mesosproto"
	util "github.com/mesos/mesos-go/mesosutil"
	"github.com/stretchr/testify/assert"
)

// Check that the same slave is only added once.
func TestSlaveStorage_checkAndAdd(t *testing.T) {
	assert := assert.New(t)

	slaveStorage := newSlaveStorage()
	assert.Equal(0, len(slaveStorage.slaves))

	slaveId := "slave1"
	slaveHostname := "slave1Hostname"
	slaveStorage.checkAndAdd(slaveId, slaveHostname)
	assert.Equal(1, len(slaveStorage.getSlaveIds()))

	slaveStorage.checkAndAdd(slaveId, slaveHostname)
	assert.Equal(1, len(slaveStorage.getSlaveIds()))
}

// Check that getSlave returns notExist for a nonexistent slave.
func TestSlaveStorage_getSlave(t *testing.T) {
	assert := assert.New(t)

	slaveStorage := newSlaveStorage()
	assert.Equal(0, len(slaveStorage.slaves))

	slaveId := "slave1"
	slaveHostname := "slave1Hostname"

	_, exists := slaveStorage.getSlave(slaveId)
	assert.Equal(false, exists)

	slaveStorage.checkAndAdd(slaveId, slaveHostname)
	assert.Equal(1, len(slaveStorage.getSlaveIds()))

	_, exists = slaveStorage.getSlave(slaveId)
	assert.Equal(true, exists)
}

// Check that getSlaveIds returns a slice with all slaveIds.
func TestSlaveStorage_getSlaveIds(t *testing.T) {
	assert := assert.New(t)

	slaveStorage := newSlaveStorage()
	assert.Equal(0, len(slaveStorage.slaves))

	slaveId := "1"
	slaveHostname := "hn1"
	slaveStorage.checkAndAdd(slaveId, slaveHostname)
	assert.Equal(1, len(slaveStorage.getSlaveIds()))

	slaveId = "2"
	slaveHostname = "hn2"
	slaveStorage.checkAndAdd(slaveId, slaveHostname)
	assert.Equal(2, len(slaveStorage.getSlaveIds()))

	slaveIds := slaveStorage.getSlaveIds()

	slaveIdsMap := make(map[string]bool, len(slaveIds))
	for _, s := range slaveIds {
		slaveIdsMap[s] = true
	}

	_, ok := slaveIdsMap["1"]
	assert.Equal(ok, true)

	_, ok = slaveIdsMap["2"]
	assert.Equal(ok, true)
}

// get number of non-expired offers from the offer registry
func getNumberOffers(os offers.Registry) int {
	// walk the offers and count those stored in the registry
	walked := 0
	walker1 := func(p offers.Perishable) (bool, error) {
		walked++
		return false, nil
	}
	os.Walk(walker1)
	return walked
}

// test adding of a resource offer; it should be added to the offer registry and slaves
func TestResourceOffer_Add(t *testing.T) {
	assert := assert.New(t)

	testScheduler := &KubernetesScheduler{
		offers: offers.CreateRegistry(offers.RegistryConfig{
			Compat: func(o *mesos.Offer) bool {
				return true
			},
			DeclineOffer: func(offerId string) <-chan error {
				return proc.ErrorChan(nil)
			},
			// remember expired offers so that we can tell if a previously scheduled offer relies on one
			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
			TTL:           schedcfg.DefaultOfferTTL,
			ListenerDelay: schedcfg.DefaultListenerDelay,
		}),
		slaves: newSlaveStorage(),
	}

	hostname := "h1"
	offerID1 := util.NewOfferID("test1")
	offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
	offers1 := []*mesos.Offer{offer1}
	testScheduler.ResourceOffers(nil, offers1)

	assert.Equal(1, getNumberOffers(testScheduler.offers))
	// check slave hostname
	assert.Equal(1, len(testScheduler.slaves.getSlaveIds()))

	// add another offer
	hostname2 := "h2"
	offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
	offers2 := []*mesos.Offer{offer2}
	testScheduler.ResourceOffers(nil, offers2)

	// check it is stored in the registry
	assert.Equal(2, getNumberOffers(testScheduler.offers))

	// check slave hostnames
	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
}

// test adding and rescinding of resource offers
func TestResourceOffer_Add_Rescind(t *testing.T) {
	assert := assert.New(t)

	testScheduler := &KubernetesScheduler{
		offers: offers.CreateRegistry(offers.RegistryConfig{
			Compat: func(o *mesos.Offer) bool {
				return true
			},
			DeclineOffer: func(offerId string) <-chan error {
				return proc.ErrorChan(nil)
			},
			// remember expired offers so that we can tell if a previously scheduled offer relies on one
			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
			TTL:           schedcfg.DefaultOfferTTL,
			ListenerDelay: schedcfg.DefaultListenerDelay,
		}),
		slaves: newSlaveStorage(),
	}

	hostname := "h1"
	offerID1 := util.NewOfferID("test1")
	offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
	offers1 := []*mesos.Offer{offer1}
	testScheduler.ResourceOffers(nil, offers1)

	assert.Equal(1, getNumberOffers(testScheduler.offers))

	// check slave hostname
	assert.Equal(1, len(testScheduler.slaves.getSlaveIds()))

	// add another offer
	hostname2 := "h2"
	offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
	offers2 := []*mesos.Offer{offer2}
	testScheduler.ResourceOffers(nil, offers2)

	assert.Equal(2, getNumberOffers(testScheduler.offers))

	// check slave hostnames
	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))

	// check whether offers can be rescinded
	testScheduler.OfferRescinded(nil, offerID1)
	assert.Equal(1, getNumberOffers(testScheduler.offers))

	// rescind the remaining offer
	testScheduler.OfferRescinded(nil, util.NewOfferID("test2"))
	// walk offers again and check it is removed from the registry
	assert.Equal(0, getNumberOffers(testScheduler.offers))

	// rescinding a non-existing ID must not panic
	testScheduler.OfferRescinded(nil, util.NewOfferID("notExist"))
}

// test that when a slave is lost we remove all its offers
func TestSlave_Lost(t *testing.T) {
	assert := assert.New(t)

	testScheduler := &KubernetesScheduler{
		offers: offers.CreateRegistry(offers.RegistryConfig{
			Compat: func(o *mesos.Offer) bool {
				return true
			},
			// remember expired offers so that we can tell if a previously scheduled offer relies on one
			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
			TTL:           schedcfg.DefaultOfferTTL,
			ListenerDelay: schedcfg.DefaultListenerDelay,
		}),
		slaves: newSlaveStorage(),
	}

	hostname := "h1"
	offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
	offers1 := []*mesos.Offer{offer1}
	testScheduler.ResourceOffers(nil, offers1)
	offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
	offers2 := []*mesos.Offer{offer2}
	testScheduler.ResourceOffers(nil, offers2)

	// add another offer from a different slaveID
	hostname2 := "h2"
	offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
	offers3 := []*mesos.Offer{offer3}
	testScheduler.ResourceOffers(nil, offers3)

	// test precondition
	assert.Equal(3, getNumberOffers(testScheduler.offers))
	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))

	// remove the first slave
	testScheduler.SlaveLost(nil, util.NewSlaveID(hostname))

	// offers should be removed
	assert.Equal(1, getNumberOffers(testScheduler.offers))
	// slave hostnames should still be all present
	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))

	// remove the second slave
	testScheduler.SlaveLost(nil, util.NewSlaveID(hostname2))

	// offers should be removed
	assert.Equal(0, getNumberOffers(testScheduler.offers))
	// slave hostnames should still be all present
	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))

	// try to remove a non-existing slave
	testScheduler.SlaveLost(nil, util.NewSlaveID("notExist"))
}

// test that when we lose the connection to the master we invalidate all cached offers
func TestDisconnect(t *testing.T) {
	assert := assert.New(t)

	testScheduler := &KubernetesScheduler{
		offers: offers.CreateRegistry(offers.RegistryConfig{
			Compat: func(o *mesos.Offer) bool {
				return true
			},
			// remember expired offers so that we can tell if a previously scheduled offer relies on one
			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
			TTL:           schedcfg.DefaultOfferTTL,
			ListenerDelay: schedcfg.DefaultListenerDelay,
		}),
		slaves: newSlaveStorage(),
	}

	hostname := "h1"
	offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
	offers1 := []*mesos.Offer{offer1}
	testScheduler.ResourceOffers(nil, offers1)
	offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
	offers2 := []*mesos.Offer{offer2}
	testScheduler.ResourceOffers(nil, offers2)

	// add another offer from a different slaveID
	hostname2 := "h2"
	offer3 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
	offers3 := []*mesos.Offer{offer3}
	testScheduler.ResourceOffers(nil, offers3)

	// disconnect
	testScheduler.Disconnected(nil)

	// all offers should be removed
	assert.Equal(0, getNumberOffers(testScheduler.offers))
	// slave hostnames should still be all present
	assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
}

// test we can handle different status updates, TODO check state transitions
func TestStatus_Update(t *testing.T) {

	mockdriver := MockSchedulerDriver{}
	// setup expectations
	mockdriver.On("KillTask", util.NewTaskID("test-task-001")).Return(mesos.Status_DRIVER_RUNNING, nil)

	testScheduler := &KubernetesScheduler{
		offers: offers.CreateRegistry(offers.RegistryConfig{
			Compat: func(o *mesos.Offer) bool {
				return true
			},
			// remember expired offers so that we can tell if a previously scheduled offer relies on one
			LingerTTL:     schedcfg.DefaultOfferLingerTTL,
			TTL:           schedcfg.DefaultOfferTTL,
			ListenerDelay: schedcfg.DefaultListenerDelay,
		}),
		slaves:       newSlaveStorage(),
		driver:       &mockdriver,
		taskRegistry: podtask.NewInMemoryRegistry(),
	}

	taskStatus_task_starting := util.NewTaskStatus(
		util.NewTaskID("test-task-001"),
		mesos.TaskState_TASK_RUNNING,
	)
	testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_starting)

	taskStatus_task_running := util.NewTaskStatus(
		util.NewTaskID("test-task-001"),
		mesos.TaskState_TASK_RUNNING,
	)
	testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_running)

	taskStatus_task_failed := util.NewTaskStatus(
		util.NewTaskID("test-task-001"),
		mesos.TaskState_TASK_FAILED,
	)
	testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_failed)

	// assert that the mock was invoked
	mockdriver.AssertExpectations(t)
}
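
// MockSchedulerDriver above is assumed to be a testify-style mock defined
// elsewhere in this package's test support code; the expectation pattern is
// the usual one (illustrative sketch):
//
//	mockdriver.On("KillTask", util.NewTaskID("test-task-001")).
//		Return(mesos.Status_DRIVER_RUNNING, nil)
//	// ...exercise the scheduler under test...
//	mockdriver.AssertExpectations(t)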
32
contrib/mesos/pkg/scheduler/service/compat_testing.go
Normal file
@@ -0,0 +1,32 @@
// +build unit_test

/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package service

import (
	"os"
	"syscall"
)

func makeFailoverSigChan() <-chan os.Signal {
	return nil
}

func makeDisownedProcAttr() *syscall.SysProcAttr {
	return nil
}
38
contrib/mesos/pkg/scheduler/service/compat_unix.go
Normal file
@@ -0,0 +1,38 @@
// +build darwin dragonfly freebsd linux netbsd openbsd
// +build !unit_test

/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package service

import (
	"os"
	"os/signal"
	"syscall"
)

func makeFailoverSigChan() <-chan os.Signal {
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGUSR1)
	return ch
}

func makeDisownedProcAttr() *syscall.SysProcAttr {
	return &syscall.SysProcAttr{
		Setpgid: true, // disown the spawned scheduler
	}
}
51
contrib/mesos/pkg/scheduler/service/compat_windows.go
Normal file
@@ -0,0 +1,51 @@
// +build windows
// +build !unit_test

/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package service

import (
	"os"
	"syscall"
)

func makeFailoverSigChan() <-chan os.Signal {
	/* TODO(jdef)
	   from go's windows compatibility test, it looks like we need to provide a filtered
	   signal channel here

	   c := make(chan os.Signal, 10)
	   signal.Notify(c)
	   select {
	   case s := <-c:
	       if s != os.Interrupt {
	           log.Fatalf("Wrong signal received: got %q, want %q\n", s, os.Interrupt)
	       }
	   case <-time.After(3 * time.Second):
	       log.Fatalf("Timeout waiting for Ctrl+Break\n")
	   }
	*/
	return nil
}

func makeDisownedProcAttr() *syscall.SysProcAttr {
	//TODO(jdef) test this somehow?!?!
	return &syscall.SysProcAttr{
		CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP | syscall.CREATE_UNICODE_ENVIRONMENT,
	}
}
18
contrib/mesos/pkg/scheduler/service/doc.go
Normal file
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package service contains the cmd/k8sm-scheduler glue code
package service
121
contrib/mesos/pkg/scheduler/service/publish.go
Normal file
@@ -0,0 +1,121 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package service

import (
	"net"
	"reflect"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports"

	"github.com/golang/glog"
)

const (
	SCHEDULER_SERVICE_NAME = "k8sm-scheduler"
)

func (m *SchedulerServer) newServiceWriter(stop <-chan struct{}) func() {
	return func() {
		for {
			// Update service & endpoint records.
			// TODO(k8s): when it becomes possible to change this stuff,
			// stop polling and start watching.
			if err := m.createSchedulerServiceIfNeeded(SCHEDULER_SERVICE_NAME, ports.SchedulerPort); err != nil {
				glog.Errorf("Can't create scheduler service: %v", err)
			}

			if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.Address), m.Port); err != nil {
				glog.Errorf("Can't create scheduler endpoints: %v", err)
			}

			select {
			case <-stop:
				return
			case <-time.After(10 * time.Second):
			}
		}
	}
}

// createSchedulerServiceIfNeeded will create the specified service if it
// doesn't already exist.
func (m *SchedulerServer) createSchedulerServiceIfNeeded(serviceName string, servicePort int) error {
	ctx := api.NewDefaultContext()
	if _, err := m.client.Services(api.NamespaceValue(ctx)).Get(serviceName); err == nil {
		// The service already exists.
		return nil
	}
	svc := &api.Service{
		ObjectMeta: api.ObjectMeta{
			Name:      serviceName,
			Namespace: api.NamespaceDefault,
			Labels:    map[string]string{"provider": "k8sm", "component": "scheduler"},
		},
		Spec: api.ServiceSpec{
			Ports: []api.ServicePort{{Port: servicePort, Protocol: api.ProtocolTCP}},
			// maintained by this code, not by the pod selector
			Selector:        nil,
			SessionAffinity: api.ServiceAffinityNone,
		},
	}
	if m.ServiceAddress != nil {
		svc.Spec.ClusterIP = m.ServiceAddress.String()
	}
	_, err := m.client.Services(api.NamespaceValue(ctx)).Create(svc)
	if err != nil && errors.IsAlreadyExists(err) {
		err = nil
	}
	return err
}

// setEndpoints sets the endpoints for the given service.
// in a multi-master scenario only the master will be publishing an endpoint.
// see SchedulerServer.bootstrap.
func (m *SchedulerServer) setEndpoints(serviceName string, ip net.IP, port int) error {
	// The setting we want to find.
	want := []api.EndpointSubset{{
		Addresses: []api.EndpointAddress{{IP: ip.String()}},
		Ports:     []api.EndpointPort{{Port: port, Protocol: api.ProtocolTCP}},
	}}

	ctx := api.NewDefaultContext()
	e, err := m.client.Endpoints(api.NamespaceValue(ctx)).Get(serviceName)
	createOrUpdate := m.client.Endpoints(api.NamespaceValue(ctx)).Update
	if err != nil {
		if errors.IsNotFound(err) {
			createOrUpdate = m.client.Endpoints(api.NamespaceValue(ctx)).Create
		}
		e = &api.Endpoints{
			ObjectMeta: api.ObjectMeta{
				Name:      serviceName,
				Namespace: api.NamespaceDefault,
			},
		}
	}
	if !reflect.DeepEqual(e.Subsets, want) {
		e.Subsets = want
		glog.Infof("setting endpoints for master service %q to %#v", serviceName, e)
		_, err = createOrUpdate(e)
		return err
	}
	// We didn't make any changes, no need to actually call update.
	return nil
}
751
contrib/mesos/pkg/scheduler/service/service.go
Normal file
@@ -0,0 +1,751 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package service

import (
	"bufio"
	"errors"
	"fmt"
	"io/ioutil"
	"net"
	"net/http"
	"os"
	"os/exec"
	"os/user"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election"
	execcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/profile"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler"
	schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/ha"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/uid"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/clientauth"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
	"github.com/coreos/go-etcd/etcd"
	"github.com/gogo/protobuf/proto"
	log "github.com/golang/glog"
	"github.com/kardianos/osext"
	"github.com/mesos/mesos-go/auth"
	"github.com/mesos/mesos-go/auth/sasl"
	"github.com/mesos/mesos-go/auth/sasl/mech"
	mesos "github.com/mesos/mesos-go/mesosproto"
	mutil "github.com/mesos/mesos-go/mesosutil"
	bindings "github.com/mesos/mesos-go/scheduler"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/spf13/pflag"
	"golang.org/x/net/context"
)

const (
	defaultMesosMaster       = "localhost:5050"
	defaultMesosUser         = "root" // should have privs to execute docker and iptables commands
	defaultReconcileInterval = 300    // 5m default task reconciliation interval
	defaultReconcileCooldown = 15 * time.Second
	defaultFrameworkName     = "Kubernetes"
)

type SchedulerServer struct {
	Port                          int
	Address                       util.IP
	EnableProfiling               bool
	AuthPath                      string
	APIServerList                 util.StringList
	EtcdServerList                util.StringList
	EtcdConfigFile                string
	AllowPrivileged               bool
	ExecutorPath                  string
	ProxyPath                     string
	MesosMaster                   string
	MesosUser                     string
	MesosRole                     string
	MesosAuthPrincipal            string
	MesosAuthSecretFile           string
	Checkpoint                    bool
	FailoverTimeout               float64
	ExecutorBindall               bool
	ExecutorRunProxy              bool
	ExecutorProxyBindall          bool
	ExecutorLogV                  int
	ExecutorSuicideTimeout        time.Duration
	MesosAuthProvider             string
	DriverPort                    uint
	HostnameOverride              string
	ReconcileInterval             int64
	ReconcileCooldown             time.Duration
	SchedulerConfigFileName       string
	Graceful                      bool
	FrameworkName                 string
	FrameworkWebURI               string
	HA                            bool
	AdvertisedAddress             string
	ServiceAddress                util.IP
	HADomain                      string
	KMPath                        string
	ClusterDNS                    util.IP
	ClusterDomain                 string
	KubeletRootDirectory          string
	KubeletDockerEndpoint         string
	KubeletPodInfraContainerImage string
	KubeletCadvisorPort           uint
	KubeletHostNetworkSources     string
	KubeletSyncFrequency          time.Duration
	KubeletNetworkPluginName      string

	executable  string // path to the binary running this service
	client      *client.Client
	driver      bindings.SchedulerDriver
	driverMutex sync.RWMutex
	mux         *http.ServeMux
}

// useful for unit testing specific funcs
type schedulerProcessInterface interface {
	End() <-chan struct{}
	Failover() <-chan struct{}
	Terminal() <-chan struct{}
}

// NewSchedulerServer creates a new SchedulerServer with default parameters
func NewSchedulerServer() *SchedulerServer {
	s := SchedulerServer{
		Port:                   ports.SchedulerPort,
		Address:                util.IP(net.ParseIP("127.0.0.1")),
		FailoverTimeout:        time.Duration((1 << 62) - 1).Seconds(),
		ExecutorRunProxy:       true,
		ExecutorSuicideTimeout: execcfg.DefaultSuicideTimeout,
		MesosAuthProvider:      sasl.ProviderName,
		MesosMaster:            defaultMesosMaster,
		MesosUser:              defaultMesosUser,
		ReconcileInterval:      defaultReconcileInterval,
		ReconcileCooldown:      defaultReconcileCooldown,
		Checkpoint:             true,
		FrameworkName:          defaultFrameworkName,
		HA:                     false,
		mux:                    http.NewServeMux(),
		KubeletCadvisorPort:    4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
		KubeletSyncFrequency:   10 * time.Second,
	}
	// cache this for later use. also useful in case the original binary gets deleted, e.g.
	// during upgrades, development deployments, etc.
	if filename, err := osext.Executable(); err != nil {
		log.Fatalf("failed to determine path to currently running executable: %v", err)
	} else {
		s.executable = filename
		s.KMPath = filename
	}

	return &s
}
|
||||
|
||||
func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
    fs.IntVar(&s.Port, "port", s.Port, "The port that the scheduler's http service runs on")
    fs.Var(&s.Address, "address", "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
    fs.BoolVar(&s.EnableProfiling, "profiling", s.EnableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
    fs.Var(&s.APIServerList, "api-servers", "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
    fs.StringVar(&s.AuthPath, "auth-path", s.AuthPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
    fs.Var(&s.EtcdServerList, "etcd-servers", "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
    fs.StringVar(&s.EtcdConfigFile, "etcd-config", s.EtcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
    fs.BoolVar(&s.AllowPrivileged, "allow-privileged", s.AllowPrivileged, "If true, allow privileged containers.")
    fs.StringVar(&s.ClusterDomain, "cluster-domain", s.ClusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
    fs.Var(&s.ClusterDNS, "cluster-dns", "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")

    fs.StringVar(&s.MesosMaster, "mesos-master", s.MesosMaster, "Location of the Mesos master. The format is a comma-delimited list of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
    fs.StringVar(&s.MesosUser, "mesos-user", s.MesosUser, "Mesos user for this framework, defaults to root.")
    fs.StringVar(&s.MesosRole, "mesos-role", s.MesosRole, "Mesos role for this framework, defaults to none.")
    fs.StringVar(&s.MesosAuthPrincipal, "mesos-authentication-principal", s.MesosAuthPrincipal, "Mesos authentication principal.")
    fs.StringVar(&s.MesosAuthSecretFile, "mesos-authentication-secret-file", s.MesosAuthSecretFile, "Mesos authentication secret file.")
    fs.StringVar(&s.MesosAuthProvider, "mesos-authentication-provider", s.MesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
    fs.BoolVar(&s.Checkpoint, "checkpoint", s.Checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
    fs.Float64Var(&s.FailoverTimeout, "failover-timeout", s.FailoverTimeout, "Framework failover timeout, in sec.")
    fs.UintVar(&s.DriverPort, "driver-port", s.DriverPort, "Port that the Mesos scheduler driver process should listen on.")
    fs.StringVar(&s.HostnameOverride, "hostname-override", s.HostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
    fs.Int64Var(&s.ReconcileInterval, "reconcile-interval", s.ReconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
    fs.DurationVar(&s.ReconcileCooldown, "reconcile-cooldown", s.ReconcileCooldown, "Minimum rest period between task reconciliation operations.")
    fs.StringVar(&s.SchedulerConfigFileName, "scheduler-config", s.SchedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
    fs.BoolVar(&s.Graceful, "graceful", s.Graceful, "Indicator of a graceful failover, intended for internal use only.")
    fs.BoolVar(&s.HA, "ha", s.HA, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
    fs.StringVar(&s.FrameworkName, "framework-name", s.FrameworkName, "The framework name to register with Mesos.")
    fs.StringVar(&s.FrameworkWebURI, "framework-weburi", s.FrameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
    fs.StringVar(&s.AdvertisedAddress, "advertised-address", s.AdvertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
    fs.Var(&s.ServiceAddress, "service-address", "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")

    fs.BoolVar(&s.ExecutorBindall, "executor-bindall", s.ExecutorBindall, "When true will set -address of the executor to 0.0.0.0.")
    fs.IntVar(&s.ExecutorLogV, "executor-logv", s.ExecutorLogV, "Logging verbosity of spawned executor processes.")
    fs.BoolVar(&s.ExecutorProxyBindall, "executor-proxy-bindall", s.ExecutorProxyBindall, "When true pass -proxy-bindall to the executor.")
    fs.BoolVar(&s.ExecutorRunProxy, "executor-run-proxy", s.ExecutorRunProxy, "Run the kube-proxy as a child process of the executor.")
    fs.DurationVar(&s.ExecutorSuicideTimeout, "executor-suicide-timeout", s.ExecutorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")

    fs.StringVar(&s.KubeletRootDirectory, "kubelet-root-dir", s.KubeletRootDirectory, "Directory path for managing kubelet files (volume mounts, etc). Defaults to executor sandbox.")
    fs.StringVar(&s.KubeletDockerEndpoint, "kubelet-docker-endpoint", s.KubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
    fs.StringVar(&s.KubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.KubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
    fs.UintVar(&s.KubeletCadvisorPort, "kubelet-cadvisor-port", s.KubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
    fs.StringVar(&s.KubeletHostNetworkSources, "kubelet-host-network-sources", s.KubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host network. For all sources use \"*\" [default=\"file\"]")
    fs.DurationVar(&s.KubeletSyncFrequency, "kubelet-sync-frequency", s.KubeletSyncFrequency, "Max period between synchronizing running containers and config")
    fs.StringVar(&s.KubeletNetworkPluginName, "kubelet-network-plugin", s.KubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")

    //TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
    //fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
}

func (s *SchedulerServer) AddStandaloneFlags(fs *pflag.FlagSet) {
    s.addCoreFlags(fs)
    fs.StringVar(&s.ExecutorPath, "executor-path", s.ExecutorPath, "Location of the kubernetes executor executable")
    fs.StringVar(&s.ProxyPath, "proxy-path", s.ProxyPath, "Location of the kubernetes proxy executable")
}

func (s *SchedulerServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
    s.addCoreFlags(fs)
    fs.StringVar(&s.KMPath, "km-path", s.KMPath, "Location of the km executable, may be a URI or an absolute file path.")
}

// serveFrameworkArtifact registers an HTTP handler for the given file and
// returns (downloadURI, basename(path)).
func (s *SchedulerServer) serveFrameworkArtifact(path string) (string, string) {
    serveFile := func(pattern string, filename string) {
        s.mux.HandleFunc(pattern, func(w http.ResponseWriter, r *http.Request) {
            http.ServeFile(w, r, filename)
        })
    }

    // Create base path (http://foobar:5000/<base>)
    pathSplit := strings.Split(path, "/")
    var base string
    if len(pathSplit) > 0 {
        base = pathSplit[len(pathSplit)-1]
    } else {
        base = path
    }
    serveFile("/"+base, path)

    hostURI := ""
    if s.AdvertisedAddress != "" {
        hostURI = fmt.Sprintf("http://%s/%s", s.AdvertisedAddress, base)
    } else if s.HA && s.HADomain != "" {
        hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.HADomain, ports.SchedulerPort, base)
    } else {
        hostURI = fmt.Sprintf("http://%s:%d/%s", s.Address.String(), s.Port, base)
    }
    log.V(2).Infof("Hosting artifact '%s' at '%s'", path, hostURI)

    return hostURI, base
}

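// Usage sketch for the artifact server above (illustrative only, not part of
// this commit; the path below is a hypothetical example): serving a local
// binary registers an HTTP handler on s.mux and yields the URI that Mesos
// fetches into the executor sandbox.
func exampleServeFrameworkArtifact(s *SchedulerServer) {
    uri, base := s.serveFrameworkArtifact("/opt/bin/km") // hypothetical path
    // With default --address/--port this resembles "http://127.0.0.1:<port>/km".
    log.V(2).Infof("artifact %q downloadable at %q", base, uri)
}
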
func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.ExecutorInfo, *uid.UID, error) {
    ci := &mesos.CommandInfo{
        Shell: proto.Bool(false),
    }

    //TODO(jdef) these should be shared constants with km
    const (
        KM_EXECUTOR = "executor"
        KM_PROXY    = "proxy"
    )

    if s.ExecutorPath != "" {
        uri, executorCmd := s.serveFrameworkArtifact(s.ExecutorPath)
        ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
        ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
    } else if !hks.FindServer(KM_EXECUTOR) {
        return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
    } else {
        if strings.Index(s.KMPath, "://") > 0 {
            // URI could point directly to an executable, e.g. hdfs:///km,
            // or else indirectly, e.g. http://acmestorage/tarball.tgz,
            // so we assume that for this case the command will always be "km"
            ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.KMPath), Executable: proto.Bool(true)})
            ci.Value = proto.String("./km") // TODO(jdef) extract constant
        } else if s.KMPath != "" {
            uri, kmCmd := s.serveFrameworkArtifact(s.KMPath)
            ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
            ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
        } else {
            uri, kmCmd := s.serveFrameworkArtifact(s.executable)
            ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
            ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
        }
        ci.Arguments = append(ci.Arguments, KM_EXECUTOR)
    }

    if s.ProxyPath != "" {
        uri, proxyCmd := s.serveFrameworkArtifact(s.ProxyPath)
        ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
        ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-exec=./%s", proxyCmd))
    } else if !hks.FindServer(KM_PROXY) {
        return nil, nil, fmt.Errorf("either run this scheduler via km or else --proxy-path is required")
    } else if s.ExecutorPath != "" {
        return nil, nil, fmt.Errorf("proxy can only use km binary if executor does the same")
    } // else, executor is smart enough to know when proxy-path is required, or to use km

    //TODO(jdef): provide some way (env var?) for users to customize executor config
    //TODO(jdef): set -address to 127.0.0.1 if `address` is 127.0.0.1
    //TODO(jdef): propagate dockercfg from RootDirectory?

    apiServerArgs := strings.Join(s.APIServerList, ",")
    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--api-servers=%s", apiServerArgs))
    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.ExecutorLogV))
    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.AllowPrivileged))
    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.ExecutorSuicideTimeout))

    if s.ExecutorBindall {
        //TODO(jdef) determine whether hostname-override is really needed for bindall,
        //because it conflicts with kubelet node status checks/updates
        //ci.Arguments = append(ci.Arguments, "--hostname-override=0.0.0.0")
        ci.Arguments = append(ci.Arguments, "--address=0.0.0.0")
    }

    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.ExecutorProxyBindall))
    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.ExecutorRunProxy))
    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.KubeletCadvisorPort))
    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.KubeletSyncFrequency))

    if s.AuthPath != "" {
        //TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file
        uri, basename := s.serveFrameworkArtifact(s.AuthPath)
        ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri)})
        ci.Arguments = append(ci.Arguments, fmt.Sprintf("--auth-path=%s", basename))
    }
    appendOptional := func(name string, value string) {
        if value != "" {
            ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value))
        }
    }
    if s.ClusterDNS != nil {
        appendOptional("cluster-dns", s.ClusterDNS.String())
    }
    appendOptional("cluster-domain", s.ClusterDomain)
    appendOptional("root-dir", s.KubeletRootDirectory)
    appendOptional("docker-endpoint", s.KubeletDockerEndpoint)
    appendOptional("pod-infra-container-image", s.KubeletPodInfraContainerImage)
    appendOptional("host-network-sources", s.KubeletHostNetworkSources)
    appendOptional("network-plugin", s.KubeletNetworkPluginName)

    log.V(1).Infof("prepared executor command %q with args '%+v'", ci.GetValue(), ci.Arguments)

    // Assemble the ExecutorInfo that the framework will register with.
    info := &mesos.ExecutorInfo{
        Command: ci,
        Name:    proto.String(execcfg.DefaultInfoName),
        Source:  proto.String(execcfg.DefaultInfoSource),
    }

    // calculate the ExecutorInfo hash to be used for validating compatibility
    // of ExecutorInfo's generated by other HA schedulers.
    ehash := hashExecutorInfo(info)
    eid := uid.New(ehash, execcfg.DefaultInfoID)
    info.ExecutorId = &mesos.ExecutorID{Value: proto.String(eid.String())}

    return info, eid, nil
}

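// For orientation, a sketch (illustrative only, with hypothetical sample
// values) of the executor command shape that prepareExecutorInfo assembles
// for a km-based deployment: the sandbox launch ends up looking like
// "./km executor --api-servers=... --v=0 --allow-privileged=false ...".
func exampleExecutorCommandShape() {
    args := []string{"executor"}                              // the KM_EXECUTOR subcommand
    args = append(args, "--api-servers=http://10.0.0.1:8080") // assumed sample value
    args = append(args, fmt.Sprintf("--v=%d", 0))             // ExecutorLogV zero value
    args = append(args, fmt.Sprintf("--allow-privileged=%t", false))
    log.V(1).Infof("sandbox launch resembles: ./km %s", strings.Join(args, " "))
}
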
// TODO(jdef): hacked from kubelet/server/server.go
// TODO(k8s): replace this with clientcmd
func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) {
    authInfo, err := clientauth.LoadFromFile(s.AuthPath)
    if err != nil {
        log.Warningf("Could not load kubernetes auth path: %v. Continuing with defaults.", err)
    }
    if authInfo == nil {
        // authInfo didn't load correctly - continue with defaults.
        authInfo = &clientauth.Info{}
    }
    clientConfig, err := authInfo.MergeWithConfig(client.Config{})
    if err != nil {
        return nil, err
    }
    if len(s.APIServerList) < 1 {
        return nil, fmt.Errorf("no api servers specified")
    }
    // TODO: adapt Kube client to support LB over several servers
    if len(s.APIServerList) > 1 {
        log.Infof("Multiple api servers specified. Picking first one")
    }
    clientConfig.Host = s.APIServerList[0]
    c, err := client.New(&clientConfig)
    if err != nil {
        return nil, err
    }
    return c, nil
}

func (s *SchedulerServer) setDriver(driver bindings.SchedulerDriver) {
    s.driverMutex.Lock()
    defer s.driverMutex.Unlock()
    s.driver = driver
}

func (s *SchedulerServer) getDriver() (driver bindings.SchedulerDriver) {
    s.driverMutex.RLock()
    defer s.driverMutex.RUnlock()
    return s.driver
}

func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
    // get scheduler low-level config
    sc := schedcfg.CreateDefaultConfig()
    if s.SchedulerConfigFileName != "" {
        f, err := os.Open(s.SchedulerConfigFileName)
        if err != nil {
            log.Fatalf("Cannot open scheduler config file: %v", err)
        }

        err = sc.Read(bufio.NewReader(f))
        if err != nil {
            log.Fatalf("Invalid scheduler config file: %v", err)
        }
    }

    schedulerProcess, driverFactory, etcdClient, eid := s.bootstrap(hks, sc)

    if s.EnableProfiling {
        profile.InstallHandler(s.mux)
    }
    go runtime.Until(func() {
        log.V(1).Info("Starting HTTP interface")
        log.Error(http.ListenAndServe(net.JoinHostPort(s.Address.String(), strconv.Itoa(s.Port)), s.mux))
    }, sc.HttpBindInterval.Duration, schedulerProcess.Terminal())

    if s.HA {
        validation := ha.ValidationFunc(validateLeadershipTransition)
        srv := ha.NewCandidate(schedulerProcess, driverFactory, validation)
        path := fmt.Sprintf(meta.DefaultElectionFormat, s.FrameworkName)
        sid := uid.New(eid.Group(), "").String()
        log.Infof("registering for election at %v with id %v", path, sid)
        go election.Notify(election.NewEtcdMasterElector(etcdClient), path, sid, srv, nil)
    } else {
        log.Infoln("self-electing in non-HA mode")
        schedulerProcess.Elect(driverFactory)
    }
    return s.awaitFailover(schedulerProcess, func() error { return s.failover(s.getDriver(), hks) })
}

// awaitFailover watches the scheduler process for failover signals and
// handles them properly; it may never return.
func (s *SchedulerServer) awaitFailover(schedulerProcess schedulerProcessInterface, handler func() error) error {

    // we only want to return the first error (if any), everyone else can block forever
    errCh := make(chan error, 1)
    doFailover := func() error {
        // we really don't expect handler to return; if it does, something went seriously wrong
        err := handler()
        if err != nil {
            defer schedulerProcess.End()
            err = fmt.Errorf("failover failed, scheduler will terminate: %v", err)
        }
        return err
    }

    // guard for failover signal processing, first signal processor wins
    failoverLatch := &runtime.Latch{}
    runtime.On(schedulerProcess.Terminal(), func() {
        if !failoverLatch.Acquire() {
            log.V(1).Infof("scheduler process ending, already failing over")
            select {}
        }
        var err error
        defer func() { errCh <- err }()
        select {
        case <-schedulerProcess.Failover():
            err = doFailover()
        default:
            if s.HA {
                err = fmt.Errorf("ha scheduler exiting instead of failing over")
            } else {
                log.Infof("exiting scheduler")
            }
        }
    })
    runtime.OnOSSignal(makeFailoverSigChan(), func(_ os.Signal) {
        if !failoverLatch.Acquire() {
            log.V(1).Infof("scheduler process signalled, already failing over")
            select {}
        }
        errCh <- doFailover()
    })
    return <-errCh
}

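// The guard above depends on runtime.Latch being acquirable exactly once.
// A minimal sketch of that contract as used here (illustrative only):
func exampleLatchFirstWins() {
    latch := &runtime.Latch{}
    if latch.Acquire() {
        log.Infof("first signal processor wins and performs the failover")
    }
    if !latch.Acquire() {
        log.Infof("later processors lose and park forever via an empty select")
    }
}
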
func validateLeadershipTransition(desired, current string) {
    log.Infof("validating leadership transition")
    d := uid.Parse(desired).Group()
    c := uid.Parse(current).Group()
    if d == 0 {
        // should *never* happen, but...
        log.Fatalf("illegal scheduler UID: %q", desired)
    }
    if d != c && c != 0 {
        log.Fatalf("desired scheduler group (%x) != current scheduler group (%x)", d, c)
    }
}

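// Sketch (illustrative only, assuming the uid API used above): scheduler UIDs
// embed the executor-config hash as their group, so comparing groups is what
// decides whether a leadership transition is safe.
func exampleGroupsCompatible(desired, current string) bool {
    return uid.Parse(desired).Group() == uid.Parse(current).Group()
}
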
// hacked from https://github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kube-apiserver/app/server.go
func newEtcd(etcdConfigFile string, etcdServerList util.StringList) (client tools.EtcdGetSet, err error) {
    if etcdConfigFile != "" {
        client, err = etcd.NewClientFromFile(etcdConfigFile)
    } else {
        client = etcd.NewClient(etcdServerList)
    }
    return
}

func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config) (*ha.SchedulerProcess, ha.DriverFactory, tools.EtcdGetSet, *uid.UID) {

    s.FrameworkName = strings.TrimSpace(s.FrameworkName)
    if s.FrameworkName == "" {
        log.Fatalf("framework-name must be a non-empty string")
    }
    s.FrameworkWebURI = strings.TrimSpace(s.FrameworkWebURI)

    metrics.Register()
    runtime.Register()
    s.mux.Handle("/metrics", prometheus.Handler())

    if (s.EtcdConfigFile != "" && len(s.EtcdServerList) != 0) || (s.EtcdConfigFile == "" && len(s.EtcdServerList) == 0) {
        log.Fatalf("specify exactly one of --etcd-servers or --etcd-config")
    }

    if len(s.APIServerList) < 1 {
        log.Fatal("No api servers specified.")
    }

    client, err := s.createAPIServerClient()
    if err != nil {
        log.Fatalf("Unable to make apiserver client: %v", err)
    }
    s.client = client

    if s.ReconcileCooldown < defaultReconcileCooldown {
        s.ReconcileCooldown = defaultReconcileCooldown
        log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.ReconcileCooldown)
    }

    executor, eid, err := s.prepareExecutorInfo(hks)
    if err != nil {
        log.Fatalf("misconfigured executor: %v", err)
    }

    // TODO(jdef): remove the dependency on etcd as soon as
    // (1) the generic config store is available for the FrameworkId storage
    // (2) the generic master election is provided by the apiserver
    // Compare docs/proposals/high-availability.md
    etcdClient, err := newEtcd(s.EtcdConfigFile, s.EtcdServerList)
    if err != nil {
        log.Fatalf("misconfigured etcd: %v", err)
    }

    mesosPodScheduler := scheduler.New(scheduler.Config{
        Schedcfg:          *sc,
        Executor:          executor,
        ScheduleFunc:      scheduler.FCFSScheduleFunc,
        Client:            client,
        EtcdClient:        etcdClient,
        FailoverTimeout:   s.FailoverTimeout,
        ReconcileInterval: s.ReconcileInterval,
        ReconcileCooldown: s.ReconcileCooldown,
    })

    masterUri := s.MesosMaster
    info, cred, err := s.buildFrameworkInfo()
    if err != nil {
        log.Fatalf("Misconfigured mesos framework: %v", err)
    }

    schedulerProcess := ha.New(mesosPodScheduler)
    dconfig := &bindings.DriverConfig{
        Scheduler:        schedulerProcess,
        Framework:        info,
        Master:           masterUri,
        Credential:       cred,
        BindingAddress:   net.IP(s.Address),
        BindingPort:      uint16(s.DriverPort),
        HostnameOverride: s.HostnameOverride,
        WithAuthContext: func(ctx context.Context) context.Context {
            ctx = auth.WithLoginProvider(ctx, s.MesosAuthProvider)
            ctx = sasl.WithBindingAddress(ctx, net.IP(s.Address))
            return ctx
        },
    }

    kpl := scheduler.NewPlugin(mesosPodScheduler.NewDefaultPluginConfig(schedulerProcess.Terminal(), s.mux))
    runtime.On(mesosPodScheduler.Registration(), func() { kpl.Run(schedulerProcess.Terminal()) })
    runtime.On(mesosPodScheduler.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))

    driverFactory := ha.DriverFactory(func() (drv bindings.SchedulerDriver, err error) {
        log.V(1).Infoln("performing deferred initialization")
        if err = mesosPodScheduler.Init(schedulerProcess.Master(), kpl, s.mux); err != nil {
            return nil, fmt.Errorf("failed to initialize pod scheduler: %v", err)
        }
        log.V(1).Infoln("deferred init complete")
        // defer obtaining framework ID to prevent multiple schedulers
        // from overwriting each other's framework IDs
        dconfig.Framework.Id, err = s.fetchFrameworkID(etcdClient)
        if err != nil {
            return nil, fmt.Errorf("failed to fetch framework ID from etcd: %v", err)
        }
        log.V(1).Infoln("constructing mesos scheduler driver")
        drv, err = bindings.NewMesosSchedulerDriver(*dconfig)
        if err != nil {
            return nil, fmt.Errorf("failed to construct scheduler driver: %v", err)
        }
        log.V(1).Infoln("constructed mesos scheduler driver:", drv)
        s.setDriver(drv)
        return drv, nil
    })

    return schedulerProcess, driverFactory, etcdClient, eid
}

func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkube.Interface) error {
    if driver != nil {
        stat, err := driver.Stop(true)
        if stat != mesos.Status_DRIVER_STOPPED {
            return fmt.Errorf("failed to stop driver for failover, received unexpected status code: %v", stat)
        } else if err != nil {
            return err
        }
    }

    // there's no guarantee that all goroutines are actually programmed intelligently with 'done'
    // signals, so we'll need to restart if we want to really stop everything

    // run the same command that we were launched with
    //TODO(jdef) assumption here is that the scheduler is the only service running in this process; we should probably validate that somehow
    args := []string{}
    flags := pflag.CommandLine
    if hks != nil {
        args = append(args, hks.Name())
        flags = hks.Flags()
    }
    flags.Visit(func(flag *pflag.Flag) {
        if flag.Name != "api-servers" && flag.Name != "etcd-servers" {
            args = append(args, fmt.Sprintf("--%s=%s", flag.Name, flag.Value.String()))
        }
    })
    if !s.Graceful {
        args = append(args, "--graceful")
    }
    if len(s.APIServerList) > 0 {
        args = append(args, "--api-servers="+strings.Join(s.APIServerList, ","))
    }
    if len(s.EtcdServerList) > 0 {
        args = append(args, "--etcd-servers="+strings.Join(s.EtcdServerList, ","))
    }
    args = append(args, flags.Args()...)

    log.V(1).Infof("spawning scheduler for graceful failover: %s %+v", s.executable, args)

    cmd := exec.Command(s.executable, args...)
    cmd.Stdin = os.Stdin
    cmd.Stdout = os.Stdout
    cmd.Stderr = os.Stderr
    cmd.SysProcAttr = makeDisownedProcAttr()

    // TODO(jdef) pass in a pipe FD so that we can block, waiting for the child proc to be ready
    //cmd.ExtraFiles = []*os.File{}

    exitcode := 0
    log.Flush() // TODO(jdef) it would be really nice to ensure that no one else in our process was still logging
    if err := cmd.Start(); err != nil {
        // log to stdout here to avoid conflicts with normal stderr logging
        fmt.Fprintf(os.Stdout, "failed to spawn failover process: %v\n", err)
        os.Exit(1)
    }
    os.Exit(exitcode)
    select {} // will never reach here
}

func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred *mesos.Credential, err error) {
    username, err := s.getUsername()
    if err != nil {
        return nil, nil, err
    }
    log.V(2).Infof("Framework configured with mesos user %v", username)
    info = &mesos.FrameworkInfo{
        Name:       proto.String(s.FrameworkName),
        User:       proto.String(username),
        Checkpoint: proto.Bool(s.Checkpoint),
    }
    if s.FrameworkWebURI != "" {
        info.WebuiUrl = proto.String(s.FrameworkWebURI)
    }
    if s.FailoverTimeout > 0 {
        info.FailoverTimeout = proto.Float64(s.FailoverTimeout)
    }
    if s.MesosRole != "" {
        info.Role = proto.String(s.MesosRole)
    }
    if s.MesosAuthPrincipal != "" {
        info.Principal = proto.String(s.MesosAuthPrincipal)
        if s.MesosAuthSecretFile == "" {
            return nil, nil, errors.New("authentication principal specified without the required credentials file")
        }
        secret, err := ioutil.ReadFile(s.MesosAuthSecretFile)
        if err != nil {
            return nil, nil, err
        }
        cred = &mesos.Credential{
            Principal: proto.String(s.MesosAuthPrincipal),
            Secret:    secret,
        }
    }
    return
}

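// Sketch (illustrative only, with a hypothetical principal and secret path):
// when a Mesos authentication principal is configured, buildFrameworkInfo
// returns both a FrameworkInfo carrying that principal and a Credential for
// SASL login.
func exampleBuildFrameworkInfo(s *SchedulerServer) {
    s.MesosAuthPrincipal = "k8sm"              // assumed sample principal
    s.MesosAuthSecretFile = "/etc/k8sm.secret" // assumed sample secret file
    info, cred, err := s.buildFrameworkInfo()
    if err != nil {
        log.Fatalf("framework info: %v", err)
    }
    log.Infof("principal=%q credential-present=%t", info.GetPrincipal(), cred != nil)
}
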
func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdGetSet) (*mesos.FrameworkID, error) {
    if s.FailoverTimeout > 0 {
        if response, err := client.Get(meta.FrameworkIDKey, false, false); err != nil {
            if !tools.IsEtcdNotFound(err) {
                return nil, fmt.Errorf("unexpected failure attempting to load framework ID from etcd: %v", err)
            }
            log.V(1).Infof("did not find framework ID in etcd")
        } else if response.Node.Value != "" {
            log.Infof("configuring FrameworkInfo with Id found in etcd: '%s'", response.Node.Value)
            return mutil.NewFrameworkID(response.Node.Value), nil
        }
    } else {
        //TODO(jdef) this seems like a totally hackish way to clean up the framework ID
        if _, err := client.Delete(meta.FrameworkIDKey, true); err != nil {
            if !tools.IsEtcdNotFound(err) {
                return nil, fmt.Errorf("failed to delete framework ID from etcd: %v", err)
            }
            log.V(1).Infof("nothing to delete: did not find framework ID in etcd")
        }
    }
    return nil, nil
}

func (s *SchedulerServer) getUsername() (username string, err error) {
    username = s.MesosUser
    if username == "" {
        if u, err := user.Current(); err == nil {
            username = u.Username
            if username == "" {
                username = defaultMesosUser
            }
        }
    }
    return
}

108
contrib/mesos/pkg/scheduler/service/service_test.go
Normal file
108
contrib/mesos/pkg/scheduler/service/service_test.go
Normal file
@@ -0,0 +1,108 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// +build unit_test

package service

import (
    "testing"
    "time"
)

type fakeSchedulerProcess struct {
    doneFunc     func() <-chan struct{}
    failoverFunc func() <-chan struct{}
}

func (self *fakeSchedulerProcess) Terminal() <-chan struct{} {
    if self == nil || self.doneFunc == nil {
        return nil
    }
    return self.doneFunc()
}

func (self *fakeSchedulerProcess) Failover() <-chan struct{} {
    if self == nil || self.failoverFunc == nil {
        return nil
    }
    return self.failoverFunc()
}

func (self *fakeSchedulerProcess) End() <-chan struct{} {
    ch := make(chan struct{})
    close(ch)
    return ch
}

func Test_awaitFailoverDone(t *testing.T) {
    done := make(chan struct{})
    p := &fakeSchedulerProcess{
        doneFunc: func() <-chan struct{} { return done },
    }
    ss := &SchedulerServer{}
    failoverHandlerCalled := false
    failoverFailedHandler := func() error {
        failoverHandlerCalled = true
        return nil
    }
    errCh := make(chan error, 1)
    go func() {
        errCh <- ss.awaitFailover(p, failoverFailedHandler)
    }()
    close(done)
    select {
    case err := <-errCh:
        if err != nil {
            t.Fatalf("unexpected error: %v", err)
        }
    case <-time.After(1 * time.Second):
        t.Fatalf("timed out waiting for failover")
    }
    if failoverHandlerCalled {
        t.Fatalf("unexpected call to failover handler")
    }
}

func Test_awaitFailoverDoneFailover(t *testing.T) {
    ch := make(chan struct{})
    p := &fakeSchedulerProcess{
        doneFunc:     func() <-chan struct{} { return ch },
        failoverFunc: func() <-chan struct{} { return ch },
    }
    ss := &SchedulerServer{}
    failoverHandlerCalled := false
    failoverFailedHandler := func() error {
        failoverHandlerCalled = true
        return nil
    }
    errCh := make(chan error, 1)
    go func() {
        errCh <- ss.awaitFailover(p, failoverFailedHandler)
    }()
    close(ch)
    select {
    case err := <-errCh:
        if err != nil {
            t.Fatalf("unexpected error: %v", err)
        }
    case <-time.After(1 * time.Second):
        t.Fatalf("timed out waiting for failover")
    }
    if !failoverHandlerCalled {
        t.Fatalf("expected call to failover handler")
    }
}
88
contrib/mesos/pkg/scheduler/service/util.go
Normal file
88
contrib/mesos/pkg/scheduler/service/util.go
Normal file
@@ -0,0 +1,88 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package service

import (
    "bytes"
    "fmt"
    "hash/crc64"
    "sort"
    "strconv"

    mesos "github.com/mesos/mesos-go/mesosproto"
)

// hashExecutorInfo computes a hashcode for ExecutorInfo that may be used as a
// reasonable litmus test with respect to compatibility across HA schedulers.
// The intent is that an HA scheduler should fail fast if it doesn't pass this
// test, rather than generating (potentially many) errors at run-time because
// a Mesos master decides that the ExecutorInfo generated by a secondary
// scheduler doesn't match that of the primary scheduler.
//
// see https://github.com/apache/mesos/blob/0.22.0/src/common/type_utils.cpp#L110
func hashExecutorInfo(info *mesos.ExecutorInfo) uint64 {
    // !!! we specifically do NOT include:
    // - Framework ID because it's a value that's initialized too late for us to use
    // - Executor ID because it's a value that includes a copy of this hash
    buf := &bytes.Buffer{}
    buf.WriteString(info.GetName())
    buf.WriteString(info.GetSource())
    buf.Write(info.Data)

    if info.Command != nil {
        buf.WriteString(info.Command.GetValue())
        buf.WriteString(info.Command.GetUser())
        buf.WriteString(strconv.FormatBool(info.Command.GetShell()))
        if sz := len(info.Command.Arguments); sz > 0 {
            x := make([]string, sz)
            copy(x, info.Command.Arguments)
            sort.Strings(x)
            for _, item := range x {
                buf.WriteString(item)
            }
        }
        if vars := info.Command.Environment.GetVariables(); len(vars) > 0 {
            names := []string{}
            e := make(map[string]string)

            for _, v := range vars {
                if name := v.GetName(); name != "" {
                    names = append(names, name)
                    e[name] = v.GetValue()
                }
            }
            sort.Strings(names)
            for _, n := range names {
                buf.WriteString(n)
                buf.WriteString("=")
                buf.WriteString(e[n])
            }
        }
        if uris := info.Command.GetUris(); len(uris) > 0 {
            su := []string{}
            for _, uri := range uris {
                su = append(su, fmt.Sprintf("%s%t%t", uri.GetValue(), uri.GetExecutable(), uri.GetExtract()))
            }
            sort.Strings(su)
            for _, uri := range su {
                buf.WriteString(uri)
            }
        }
        //TODO(jdef) add support for Resources and Container
    }
    table := crc64.MakeTable(crc64.ECMA)
    return crc64.Checksum(buf.Bytes(), table)
}
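
// Sketch (illustrative only): an HA peer can use the hash as the fail-fast
// compatibility check described above, aborting registration early when its
// generated ExecutorInfo diverges from the leader's.
func exampleExecutorInfoCompatible(mine, leaders *mesos.ExecutorInfo) bool {
    // Equal hashes imply the two schedulers produce interchangeable executors.
    return hashExecutorInfo(mine) == hashExecutorInfo(leaders)
}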