Update godeps for etcd 3.0.4

2016-07-22 13:54:40 -05:00
parent 456c43c22d
commit 5f008faa8b
457 changed files with 25492 additions and 10481 deletions
--- a/vendor/github.com/coreos/etcd/mvcc/backend/backend.go
+++ b/vendor/github.com/coreos/etcd/mvcc/backend/backend.go
@@ -0,0 +1,349 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package backend
+
+import (
+	"fmt"
+	"hash/crc32"
+	"io"
+	"io/ioutil"
+	"os"
+	"path"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/boltdb/bolt"
+	"github.com/coreos/pkg/capnslog"
+)
+
+var (
+	defaultBatchLimit    = 10000
+	defaultBatchInterval = 100 * time.Millisecond
+
+	defragLimit = 10000
+
+	// InitialMmapSize is the initial size of the mmapped region. Setting this larger than
+	// the potential max db size can prevent writer from blocking reader.
+	// This only works for linux.
+	InitialMmapSize = int64(10 * 1024 * 1024 * 1024)
+
+	plog = capnslog.NewPackageLogger("github.com/coreos/etcd/mvcc", "backend")
+)
+
+const (
+	// DefaultQuotaBytes is the number of bytes the backend Size may
+	// consume before exceeding the space quota.
+	DefaultQuotaBytes = int64(2 * 1024 * 1024 * 1024) // 2GB
+	// MaxQuotaBytes is the maximum number of bytes suggested for a backend
+	// quota. A larger quota may lead to degraded performance.
+	MaxQuotaBytes = int64(8 * 1024 * 1024 * 1024) // 8GB
+)
+
+type Backend interface {
+	BatchTx() BatchTx
+	Snapshot() Snapshot
+	Hash(ignores map[IgnoreKey]struct{}) (uint32, error)
+	// Size returns the current size of the backend.
+	Size() int64
+	Defrag() error
+	ForceCommit()
+	Close() error
+}
+
+type Snapshot interface {
+	// Size gets the size of the snapshot.
+	Size() int64
+	// WriteTo writes the snapshot into the given writer.
+	WriteTo(w io.Writer) (n int64, err error)
+	// Close closes the snapshot.
+	Close() error
+}
+
+type backend struct {
+	// size and commits are used with atomic operations so they must be
+	// 64-bit aligned, otherwise 32-bit tests will crash
+
+	// size is the number of bytes in the backend
+	size int64
+	// commits counts number of commits since start
+	commits int64
+
+	mu sync.RWMutex
+	db *bolt.DB
+
+	batchInterval time.Duration
+	batchLimit    int
+	batchTx       *batchTx
+
+	stopc chan struct{}
+	donec chan struct{}
+}
+
+func New(path string, d time.Duration, limit int) Backend {
+	return newBackend(path, d, limit)
+}
+
+func NewDefaultBackend(path string) Backend {
+	return newBackend(path, defaultBatchInterval, defaultBatchLimit)
+}
+
+func newBackend(path string, d time.Duration, limit int) *backend {
+	db, err := bolt.Open(path, 0600, boltOpenOptions)
+	if err != nil {
+		plog.Panicf("cannot open database at %s (%v)", path, err)
+	}
+
+	b := &backend{
+		db: db,
+
+		batchInterval: d,
+		batchLimit:    limit,
+
+		stopc: make(chan struct{}),
+		donec: make(chan struct{}),
+	}
+	b.batchTx = newBatchTx(b)
+	go b.run()
+	return b
+}
+
+// BatchTx returns the current batch tx in coalescer. The tx can be used for read and
+// write operations. The write result can be retrieved within the same tx immediately.
+// The write result is isolated with other txs until the current one get committed.
+func (b *backend) BatchTx() BatchTx {
+	return b.batchTx
+}
+
+// ForceCommit forces the current batching tx to commit.
+func (b *backend) ForceCommit() {
+	b.batchTx.Commit()
+}
+
+func (b *backend) Snapshot() Snapshot {
+	b.batchTx.Commit()
+
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+	tx, err := b.db.Begin(false)
+	if err != nil {
+		plog.Fatalf("cannot begin tx (%s)", err)
+	}
+	return &snapshot{tx}
+}
+
+type IgnoreKey struct {
+	Bucket string
+	Key    string
+}
+
+func (b *backend) Hash(ignores map[IgnoreKey]struct{}) (uint32, error) {
+	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
+
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+	err := b.db.View(func(tx *bolt.Tx) error {
+		c := tx.Cursor()
+		for next, _ := c.First(); next != nil; next, _ = c.Next() {
+			b := tx.Bucket(next)
+			if b == nil {
+				return fmt.Errorf("cannot get hash of bucket %s", string(next))
+			}
+			h.Write(next)
+			b.ForEach(func(k, v []byte) error {
+				bk := IgnoreKey{Bucket: string(next), Key: string(k)}
+				if _, ok := ignores[bk]; !ok {
+					h.Write(k)
+					h.Write(v)
+				}
+				return nil
+			})
+		}
+		return nil
+	})
+
+	if err != nil {
+		return 0, err
+	}
+
+	return h.Sum32(), nil
+}
+
+func (b *backend) Size() int64 {
+	return atomic.LoadInt64(&b.size)
+}
+
+func (b *backend) run() {
+	defer close(b.donec)
+	t := time.NewTimer(b.batchInterval)
+	defer t.Stop()
+	for {
+		select {
+		case <-t.C:
+		case <-b.stopc:
+			b.batchTx.CommitAndStop()
+			return
+		}
+		b.batchTx.Commit()
+		t.Reset(b.batchInterval)
+	}
+}
+
+func (b *backend) Close() error {
+	close(b.stopc)
+	<-b.donec
+	return b.db.Close()
+}
+
+// Commits returns total number of commits since start
+func (b *backend) Commits() int64 {
+	return atomic.LoadInt64(&b.commits)
+}
+
+func (b *backend) Defrag() error {
+	err := b.defrag()
+	if err != nil {
+		return err
+	}
+
+	// commit to update metadata like db.size
+	b.batchTx.Commit()
+
+	return nil
+}
+
+func (b *backend) defrag() error {
+	// TODO: make this non-blocking?
+	// lock batchTx to ensure nobody is using previous tx, and then
+	// close previous ongoing tx.
+	b.batchTx.Lock()
+	defer b.batchTx.Unlock()
+
+	// lock database after lock tx to avoid deadlock.
+	b.mu.Lock()
+	defer b.mu.Unlock()
+
+	b.batchTx.commit(true)
+	b.batchTx.tx = nil
+
+	tmpdb, err := bolt.Open(b.db.Path()+".tmp", 0600, boltOpenOptions)
+	if err != nil {
+		return err
+	}
+
+	err = defragdb(b.db, tmpdb, defragLimit)
+
+	if err != nil {
+		tmpdb.Close()
+		os.RemoveAll(tmpdb.Path())
+		return err
+	}
+
+	dbp := b.db.Path()
+	tdbp := tmpdb.Path()
+
+	err = b.db.Close()
+	if err != nil {
+		plog.Fatalf("cannot close database (%s)", err)
+	}
+	err = tmpdb.Close()
+	if err != nil {
+		plog.Fatalf("cannot close database (%s)", err)
+	}
+	err = os.Rename(tdbp, dbp)
+	if err != nil {
+		plog.Fatalf("cannot rename database (%s)", err)
+	}
+
+	b.db, err = bolt.Open(dbp, 0600, boltOpenOptions)
+	if err != nil {
+		plog.Panicf("cannot open database at %s (%v)", dbp, err)
+	}
+	b.batchTx.tx, err = b.db.Begin(true)
+	if err != nil {
+		plog.Fatalf("cannot begin tx (%s)", err)
+	}
+
+	return nil
+}
+
+func defragdb(odb, tmpdb *bolt.DB, limit int) error {
+	// open a tx on tmpdb for writes
+	tmptx, err := tmpdb.Begin(true)
+	if err != nil {
+		return err
+	}
+
+	// open a tx on old db for read
+	tx, err := odb.Begin(false)
+	if err != nil {
+		return err
+	}
+	defer tx.Rollback()
+
+	c := tx.Cursor()
+
+	count := 0
+	for next, _ := c.First(); next != nil; next, _ = c.Next() {
+		b := tx.Bucket(next)
+		if b == nil {
+			return fmt.Errorf("backend: cannot defrag bucket %s", string(next))
+		}
+
+		tmpb, berr := tmptx.CreateBucketIfNotExists(next)
+		if berr != nil {
+			return berr
+		}
+
+		b.ForEach(func(k, v []byte) error {
+			count++
+			if count > limit {
+				err = tmptx.Commit()
+				if err != nil {
+					return err
+				}
+				tmptx, err = tmpdb.Begin(true)
+				if err != nil {
+					return err
+				}
+				tmpb = tmptx.Bucket(next)
+				count = 0
+			}
+			return tmpb.Put(k, v)
+		})
+	}
+
+	return tmptx.Commit()
+}
+
+// NewTmpBackend creates a backend implementation for testing.
+func NewTmpBackend(batchInterval time.Duration, batchLimit int) (*backend, string) {
+	dir, err := ioutil.TempDir(os.TempDir(), "etcd_backend_test")
+	if err != nil {
+		plog.Fatal(err)
+	}
+	tmpPath := path.Join(dir, "database")
+	return newBackend(tmpPath, batchInterval, batchLimit), tmpPath
+}
+
+func NewDefaultTmpBackend() (*backend, string) {
+	return NewTmpBackend(defaultBatchInterval, defaultBatchLimit)
+}
+
+type snapshot struct {
+	*bolt.Tx
+}
+
+func (s *snapshot) Close() error { return s.Tx.Rollback() }
--- a/vendor/github.com/coreos/etcd/mvcc/backend/batch_tx.go
+++ b/vendor/github.com/coreos/etcd/mvcc/backend/batch_tx.go
@@ -0,0 +1,191 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package backend
+
+import (
+	"bytes"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/boltdb/bolt"
+)
+
+type BatchTx interface {
+	Lock()
+	Unlock()
+	UnsafeCreateBucket(name []byte)
+	UnsafePut(bucketName []byte, key []byte, value []byte)
+	UnsafeSeqPut(bucketName []byte, key []byte, value []byte)
+	UnsafeRange(bucketName []byte, key, endKey []byte, limit int64) (keys [][]byte, vals [][]byte)
+	UnsafeDelete(bucketName []byte, key []byte)
+	UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error
+	Commit()
+	CommitAndStop()
+}
+
+type batchTx struct {
+	sync.Mutex
+	tx      *bolt.Tx
+	backend *backend
+	pending int
+}
+
+func newBatchTx(backend *backend) *batchTx {
+	tx := &batchTx{backend: backend}
+	tx.Commit()
+	return tx
+}
+
+func (t *batchTx) UnsafeCreateBucket(name []byte) {
+	_, err := t.tx.CreateBucket(name)
+	if err != nil && err != bolt.ErrBucketExists {
+		plog.Fatalf("cannot create bucket %s (%v)", name, err)
+	}
+	t.pending++
+}
+
+// UnsafePut must be called holding the lock on the tx.
+func (t *batchTx) UnsafePut(bucketName []byte, key []byte, value []byte) {
+	t.unsafePut(bucketName, key, value, false)
+}
+
+// UnsafeSeqPut must be called holding the lock on the tx.
+func (t *batchTx) UnsafeSeqPut(bucketName []byte, key []byte, value []byte) {
+	t.unsafePut(bucketName, key, value, true)
+}
+
+func (t *batchTx) unsafePut(bucketName []byte, key []byte, value []byte, seq bool) {
+	bucket := t.tx.Bucket(bucketName)
+	if bucket == nil {
+		plog.Fatalf("bucket %s does not exist", bucketName)
+	}
+	if seq {
+		// it is useful to increase fill percent when the workloads are mostly append-only.
+		// this can delay the page split and reduce space usage.
+		bucket.FillPercent = 0.9
+	}
+	if err := bucket.Put(key, value); err != nil {
+		plog.Fatalf("cannot put key into bucket (%v)", err)
+	}
+	t.pending++
+}
+
+// UnsafeRange must be called holding the lock on the tx.
+func (t *batchTx) UnsafeRange(bucketName []byte, key, endKey []byte, limit int64) (keys [][]byte, vs [][]byte) {
+	bucket := t.tx.Bucket(bucketName)
+	if bucket == nil {
+		plog.Fatalf("bucket %s does not exist", bucketName)
+	}
+
+	if len(endKey) == 0 {
+		if v := bucket.Get(key); v == nil {
+			return keys, vs
+		} else {
+			return append(keys, key), append(vs, v)
+		}
+	}
+
+	c := bucket.Cursor()
+	for ck, cv := c.Seek(key); ck != nil && bytes.Compare(ck, endKey) < 0; ck, cv = c.Next() {
+		vs = append(vs, cv)
+		keys = append(keys, ck)
+		if limit > 0 && limit == int64(len(keys)) {
+			break
+		}
+	}
+
+	return keys, vs
+}
+
+// UnsafeDelete must be called holding the lock on the tx.
+func (t *batchTx) UnsafeDelete(bucketName []byte, key []byte) {
+	bucket := t.tx.Bucket(bucketName)
+	if bucket == nil {
+		plog.Fatalf("bucket %s does not exist", bucketName)
+	}
+	err := bucket.Delete(key)
+	if err != nil {
+		plog.Fatalf("cannot delete key from bucket (%v)", err)
+	}
+	t.pending++
+}
+
+// UnsafeForEach must be called holding the lock on the tx.
+func (t *batchTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error {
+	b := t.tx.Bucket(bucketName)
+	if b == nil {
+		// bucket does not exist
+		return nil
+	}
+	return b.ForEach(visitor)
+}
+
+// Commit commits a previous tx and begins a new writable one.
+func (t *batchTx) Commit() {
+	t.Lock()
+	defer t.Unlock()
+	t.commit(false)
+}
+
+// CommitAndStop commits the previous tx and do not create a new one.
+func (t *batchTx) CommitAndStop() {
+	t.Lock()
+	defer t.Unlock()
+	t.commit(true)
+}
+
+func (t *batchTx) Unlock() {
+	if t.pending >= t.backend.batchLimit {
+		t.commit(false)
+		t.pending = 0
+	}
+	t.Mutex.Unlock()
+}
+
+func (t *batchTx) commit(stop bool) {
+	var err error
+	// commit the last tx
+	if t.tx != nil {
+		if t.pending == 0 && !stop {
+			t.backend.mu.RLock()
+			defer t.backend.mu.RUnlock()
+			atomic.StoreInt64(&t.backend.size, t.tx.Size())
+			return
+		}
+		start := time.Now()
+		err = t.tx.Commit()
+		commitDurations.Observe(time.Since(start).Seconds())
+		atomic.AddInt64(&t.backend.commits, 1)
+
+		t.pending = 0
+		if err != nil {
+			plog.Fatalf("cannot commit tx (%s)", err)
+		}
+	}
+
+	if stop {
+		return
+	}
+
+	t.backend.mu.RLock()
+	defer t.backend.mu.RUnlock()
+	// begin a new tx
+	t.tx, err = t.backend.db.Begin(true)
+	if err != nil {
+		plog.Fatalf("cannot begin tx (%s)", err)
+	}
+	atomic.StoreInt64(&t.backend.size, t.tx.Size())
+}
--- a/vendor/github.com/coreos/etcd/mvcc/backend/boltoption_default.go
+++ b/vendor/github.com/coreos/etcd/mvcc/backend/boltoption_default.go
@@ -0,0 +1,21 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !linux
+
+package backend
+
+import "github.com/boltdb/bolt"
+
+var boltOpenOptions *bolt.Options = nil
--- a/vendor/github.com/coreos/etcd/mvcc/backend/boltoption_linux.go
+++ b/vendor/github.com/coreos/etcd/mvcc/backend/boltoption_linux.go
@@ -0,0 +1,32 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package backend
+
+import (
+	"syscall"
+
+	"github.com/boltdb/bolt"
+)
+
+// syscall.MAP_POPULATE on linux 2.6.23+ does sequential read-ahead
+// which can speed up entire-database read with boltdb. We want to
+// enable MAP_POPULATE for faster key-value store recovery in storage
+// package. If your kernel version is lower than 2.6.23
+// (https://github.com/torvalds/linux/releases/tag/v2.6.23), mmap might
+// silently ignore this flag. Please update your kernel to prevent this.
+var boltOpenOptions = &bolt.Options{
+	MmapFlags:       syscall.MAP_POPULATE,
+	InitialMmapSize: int(InitialMmapSize),
+}
--- a/vendor/github.com/coreos/etcd/mvcc/backend/doc.go
+++ b/vendor/github.com/coreos/etcd/mvcc/backend/doc.go
@@ -0,0 +1,16 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package backend defines a standard interface for etcd's backend MVCC storage.
+package backend
--- a/vendor/github.com/coreos/etcd/mvcc/backend/metrics.go
+++ b/vendor/github.com/coreos/etcd/mvcc/backend/metrics.go
@@ -0,0 +1,31 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package backend
+
+import "github.com/prometheus/client_golang/prometheus"
+
+var (
+	commitDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
+		Namespace: "etcd",
+		Subsystem: "disk",
+		Name:      "backend_commit_duration_seconds",
+		Help:      "The latency distributions of commit called by backend.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
+	})
+)
+
+func init() {
+	prometheus.MustRegister(commitDurations)
+}
--- a/vendor/github.com/coreos/etcd/mvcc/doc.go
+++ b/vendor/github.com/coreos/etcd/mvcc/doc.go
@@ -0,0 +1,16 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package mvcc defines etcd's stable MVCC storage.
+package mvcc
--- a/vendor/github.com/coreos/etcd/mvcc/index.go
+++ b/vendor/github.com/coreos/etcd/mvcc/index.go
@@ -0,0 +1,217 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"sort"
+	"sync"
+
+	"github.com/google/btree"
+)
+
+type index interface {
+	Get(key []byte, atRev int64) (rev, created revision, ver int64, err error)
+	Range(key, end []byte, atRev int64) ([][]byte, []revision)
+	Put(key []byte, rev revision)
+	Restore(key []byte, created, modified revision, ver int64)
+	Tombstone(key []byte, rev revision) error
+	RangeSince(key, end []byte, rev int64) []revision
+	Compact(rev int64) map[revision]struct{}
+	Equal(b index) bool
+}
+
+type treeIndex struct {
+	sync.RWMutex
+	tree *btree.BTree
+}
+
+func newTreeIndex() index {
+	return &treeIndex{
+		tree: btree.New(32),
+	}
+}
+
+func (ti *treeIndex) Put(key []byte, rev revision) {
+	keyi := &keyIndex{key: key}
+
+	ti.Lock()
+	defer ti.Unlock()
+	item := ti.tree.Get(keyi)
+	if item == nil {
+		keyi.put(rev.main, rev.sub)
+		ti.tree.ReplaceOrInsert(keyi)
+		return
+	}
+	okeyi := item.(*keyIndex)
+	okeyi.put(rev.main, rev.sub)
+}
+
+func (ti *treeIndex) Restore(key []byte, created, modified revision, ver int64) {
+	keyi := &keyIndex{key: key}
+
+	ti.Lock()
+	defer ti.Unlock()
+	item := ti.tree.Get(keyi)
+	if item == nil {
+		keyi.restore(created, modified, ver)
+		ti.tree.ReplaceOrInsert(keyi)
+		return
+	}
+	okeyi := item.(*keyIndex)
+	okeyi.put(modified.main, modified.sub)
+}
+
+func (ti *treeIndex) Get(key []byte, atRev int64) (modified, created revision, ver int64, err error) {
+	keyi := &keyIndex{key: key}
+
+	ti.RLock()
+	defer ti.RUnlock()
+	item := ti.tree.Get(keyi)
+	if item == nil {
+		return revision{}, revision{}, 0, ErrRevisionNotFound
+	}
+
+	keyi = item.(*keyIndex)
+	return keyi.get(atRev)
+}
+
+func (ti *treeIndex) Range(key, end []byte, atRev int64) (keys [][]byte, revs []revision) {
+	if end == nil {
+		rev, _, _, err := ti.Get(key, atRev)
+		if err != nil {
+			return nil, nil
+		}
+		return [][]byte{key}, []revision{rev}
+	}
+
+	keyi := &keyIndex{key: key}
+	endi := &keyIndex{key: end}
+
+	ti.RLock()
+	defer ti.RUnlock()
+
+	ti.tree.AscendGreaterOrEqual(keyi, func(item btree.Item) bool {
+		if len(endi.key) > 0 && !item.Less(endi) {
+			return false
+		}
+		curKeyi := item.(*keyIndex)
+		rev, _, _, err := curKeyi.get(atRev)
+		if err != nil {
+			return true
+		}
+		revs = append(revs, rev)
+		keys = append(keys, curKeyi.key)
+		return true
+	})
+
+	return keys, revs
+}
+
+func (ti *treeIndex) Tombstone(key []byte, rev revision) error {
+	keyi := &keyIndex{key: key}
+
+	ti.Lock()
+	defer ti.Unlock()
+	item := ti.tree.Get(keyi)
+	if item == nil {
+		return ErrRevisionNotFound
+	}
+
+	ki := item.(*keyIndex)
+	return ki.tombstone(rev.main, rev.sub)
+}
+
+// RangeSince returns all revisions from key(including) to end(excluding)
+// at or after the given rev. The returned slice is sorted in the order
+// of revision.
+func (ti *treeIndex) RangeSince(key, end []byte, rev int64) []revision {
+	ti.RLock()
+	defer ti.RUnlock()
+
+	keyi := &keyIndex{key: key}
+	if end == nil {
+		item := ti.tree.Get(keyi)
+		if item == nil {
+			return nil
+		}
+		keyi = item.(*keyIndex)
+		return keyi.since(rev)
+	}
+
+	endi := &keyIndex{key: end}
+	var revs []revision
+	ti.tree.AscendGreaterOrEqual(keyi, func(item btree.Item) bool {
+		if len(endi.key) > 0 && !item.Less(endi) {
+			return false
+		}
+		curKeyi := item.(*keyIndex)
+		revs = append(revs, curKeyi.since(rev)...)
+		return true
+	})
+	sort.Sort(revisions(revs))
+
+	return revs
+}
+
+func (ti *treeIndex) Compact(rev int64) map[revision]struct{} {
+	available := make(map[revision]struct{})
+	var emptyki []*keyIndex
+	plog.Printf("store.index: compact %d", rev)
+	// TODO: do not hold the lock for long time?
+	// This is probably OK. Compacting 10M keys takes O(10ms).
+	ti.Lock()
+	defer ti.Unlock()
+	ti.tree.Ascend(compactIndex(rev, available, &emptyki))
+	for _, ki := range emptyki {
+		item := ti.tree.Delete(ki)
+		if item == nil {
+			plog.Panic("store.index: unexpected delete failure during compaction")
+		}
+	}
+	return available
+}
+
+func compactIndex(rev int64, available map[revision]struct{}, emptyki *[]*keyIndex) func(i btree.Item) bool {
+	return func(i btree.Item) bool {
+		keyi := i.(*keyIndex)
+		keyi.compact(rev, available)
+		if keyi.isEmpty() {
+			*emptyki = append(*emptyki, keyi)
+		}
+		return true
+	}
+}
+
+func (a *treeIndex) Equal(bi index) bool {
+	b := bi.(*treeIndex)
+
+	if a.tree.Len() != b.tree.Len() {
+		return false
+	}
+
+	equal := true
+
+	a.tree.Ascend(func(item btree.Item) bool {
+		aki := item.(*keyIndex)
+		bki := b.tree.Get(item).(*keyIndex)
+		if !aki.equal(bki) {
+			equal = false
+			return false
+		}
+		return true
+	})
+
+	return equal
+}
--- a/vendor/github.com/coreos/etcd/mvcc/key_index.go
+++ b/vendor/github.com/coreos/etcd/mvcc/key_index.go
@@ -0,0 +1,333 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+
+	"github.com/google/btree"
+)
+
+var (
+	ErrRevisionNotFound = errors.New("mvcc: revision not found")
+)
+
+// keyIndex stores the revisions of a key in the backend.
+// Each keyIndex has at least one key generation.
+// Each generation might have several key versions.
+// Tombstone on a key appends an tombstone version at the end
+// of the current generation and creates a new empty generation.
+// Each version of a key has an index pointing to the backend.
+//
+// For example: put(1.0);put(2.0);tombstone(3.0);put(4.0);tombstone(5.0) on key "foo"
+// generate a keyIndex:
+// key:     "foo"
+// rev: 5
+// generations:
+//    {empty}
+//    {4.0, 5.0(t)}
+//    {1.0, 2.0, 3.0(t)}
+//
+// Compact a keyIndex removes the versions with smaller or equal to
+// rev except the largest one. If the generation becomes empty
+// during compaction, it will be removed. if all the generations get
+// removed, the keyIndex should be removed.
+
+// For example:
+// compact(2) on the previous example
+// generations:
+//    {empty}
+//    {4.0, 5.0(t)}
+//    {2.0, 3.0(t)}
+//
+// compact(4)
+// generations:
+//    {empty}
+//    {4.0, 5.0(t)}
+//
+// compact(5):
+// generations:
+//    {empty} -> key SHOULD be removed.
+//
+// compact(6):
+// generations:
+//    {empty} -> key SHOULD be removed.
+type keyIndex struct {
+	key         []byte
+	modified    revision // the main rev of the last modification
+	generations []generation
+}
+
+// put puts a revision to the keyIndex.
+func (ki *keyIndex) put(main int64, sub int64) {
+	rev := revision{main: main, sub: sub}
+
+	if !rev.GreaterThan(ki.modified) {
+		plog.Panicf("store.keyindex: put with unexpected smaller revision [%v / %v]", rev, ki.modified)
+	}
+	if len(ki.generations) == 0 {
+		ki.generations = append(ki.generations, generation{})
+	}
+	g := &ki.generations[len(ki.generations)-1]
+	if len(g.revs) == 0 { // create a new key
+		keysGauge.Inc()
+		g.created = rev
+	}
+	g.revs = append(g.revs, rev)
+	g.ver++
+	ki.modified = rev
+}
+
+func (ki *keyIndex) restore(created, modified revision, ver int64) {
+	if len(ki.generations) != 0 {
+		plog.Panicf("store.keyindex: cannot restore non-empty keyIndex")
+	}
+
+	ki.modified = modified
+	g := generation{created: created, ver: ver, revs: []revision{modified}}
+	ki.generations = append(ki.generations, g)
+	keysGauge.Inc()
+}
+
+// tombstone puts a revision, pointing to a tombstone, to the keyIndex.
+// It also creates a new empty generation in the keyIndex.
+// It returns ErrRevisionNotFound when tombstone on an empty generation.
+func (ki *keyIndex) tombstone(main int64, sub int64) error {
+	if ki.isEmpty() {
+		plog.Panicf("store.keyindex: unexpected tombstone on empty keyIndex %s", string(ki.key))
+	}
+	if ki.generations[len(ki.generations)-1].isEmpty() {
+		return ErrRevisionNotFound
+	}
+	ki.put(main, sub)
+	ki.generations = append(ki.generations, generation{})
+	keysGauge.Dec()
+	return nil
+}
+
+// get gets the modified, created revision and version of the key that satisfies the given atRev.
+// Rev must be higher than or equal to the given atRev.
+func (ki *keyIndex) get(atRev int64) (modified, created revision, ver int64, err error) {
+	if ki.isEmpty() {
+		plog.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key))
+	}
+	g := ki.findGeneration(atRev)
+	if g.isEmpty() {
+		return revision{}, revision{}, 0, ErrRevisionNotFound
+	}
+
+	n := g.walk(func(rev revision) bool { return rev.main > atRev })
+	if n != -1 {
+		return g.revs[n], g.created, g.ver - int64(len(g.revs)-n-1), nil
+	}
+
+	return revision{}, revision{}, 0, ErrRevisionNotFound
+}
+
+// since returns revisions since the given rev. Only the revision with the
+// largest sub revision will be returned if multiple revisions have the same
+// main revision.
+func (ki *keyIndex) since(rev int64) []revision {
+	if ki.isEmpty() {
+		plog.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key))
+	}
+	since := revision{rev, 0}
+	var gi int
+	// find the generations to start checking
+	for gi = len(ki.generations) - 1; gi > 0; gi-- {
+		g := ki.generations[gi]
+		if g.isEmpty() {
+			continue
+		}
+		if since.GreaterThan(g.created) {
+			break
+		}
+	}
+
+	var revs []revision
+	var last int64
+	for ; gi < len(ki.generations); gi++ {
+		for _, r := range ki.generations[gi].revs {
+			if since.GreaterThan(r) {
+				continue
+			}
+			if r.main == last {
+				// replace the revision with a new one that has higher sub value,
+				// because the original one should not be seen by external
+				revs[len(revs)-1] = r
+				continue
+			}
+			revs = append(revs, r)
+			last = r.main
+		}
+	}
+	return revs
+}
+
+// compact compacts a keyIndex by removing the versions with smaller or equal
+// revision than the given atRev except the largest one (If the largest one is
+// a tombstone, it will not be kept).
+// If a generation becomes empty during compaction, it will be removed.
+func (ki *keyIndex) compact(atRev int64, available map[revision]struct{}) {
+	if ki.isEmpty() {
+		plog.Panicf("store.keyindex: unexpected compact on empty keyIndex %s", string(ki.key))
+	}
+
+	// walk until reaching the first revision that has an revision smaller or equal to
+	// the atRev.
+	// add it to the available map
+	f := func(rev revision) bool {
+		if rev.main <= atRev {
+			available[rev] = struct{}{}
+			return false
+		}
+		return true
+	}
+
+	i, g := 0, &ki.generations[0]
+	// find first generation includes atRev or created after atRev
+	for i < len(ki.generations)-1 {
+		if tomb := g.revs[len(g.revs)-1].main; tomb > atRev {
+			break
+		}
+		i++
+		g = &ki.generations[i]
+	}
+
+	if !g.isEmpty() {
+		n := g.walk(f)
+		// remove the previous contents.
+		if n != -1 {
+			g.revs = g.revs[n:]
+		}
+		// remove any tombstone
+		if len(g.revs) == 1 && i != len(ki.generations)-1 {
+			delete(available, g.revs[0])
+			i++
+		}
+	}
+	// remove the previous generations.
+	ki.generations = ki.generations[i:]
+	return
+}
+
+func (ki *keyIndex) isEmpty() bool {
+	return len(ki.generations) == 1 && ki.generations[0].isEmpty()
+}
+
+// findGeneration finds out the generation of the keyIndex that the
+// given rev belongs to. If the given rev is at the gap of two generations,
+// which means that the key does not exist at the given rev, it returns nil.
+func (ki *keyIndex) findGeneration(rev int64) *generation {
+	lastg := len(ki.generations) - 1
+	cg := lastg
+
+	for cg >= 0 {
+		if len(ki.generations[cg].revs) == 0 {
+			cg--
+			continue
+		}
+		g := ki.generations[cg]
+		if cg != lastg {
+			if tomb := g.revs[len(g.revs)-1].main; tomb <= rev {
+				return nil
+			}
+		}
+		if g.revs[0].main <= rev {
+			return &ki.generations[cg]
+		}
+		cg--
+	}
+	return nil
+}
+
+func (a *keyIndex) Less(b btree.Item) bool {
+	return bytes.Compare(a.key, b.(*keyIndex).key) == -1
+}
+
+func (a *keyIndex) equal(b *keyIndex) bool {
+	if !bytes.Equal(a.key, b.key) {
+		return false
+	}
+	if a.modified != b.modified {
+		return false
+	}
+	if len(a.generations) != len(b.generations) {
+		return false
+	}
+	for i := range a.generations {
+		ag, bg := a.generations[i], b.generations[i]
+		if !ag.equal(bg) {
+			return false
+		}
+	}
+	return true
+}
+
+func (ki *keyIndex) String() string {
+	var s string
+	for _, g := range ki.generations {
+		s += g.String()
+	}
+	return s
+}
+
+// generation contains multiple revisions of a key.
+type generation struct {
+	ver     int64
+	created revision // when the generation is created (put in first revision).
+	revs    []revision
+}
+
+func (g *generation) isEmpty() bool { return g == nil || len(g.revs) == 0 }
+
+// walk walks through the revisions in the generation in descending order.
+// It passes the revision to the given function.
+// walk returns until: 1. it finishes walking all pairs 2. the function returns false.
+// walk returns the position at where it stopped. If it stopped after
+// finishing walking, -1 will be returned.
+func (g *generation) walk(f func(rev revision) bool) int {
+	l := len(g.revs)
+	for i := range g.revs {
+		ok := f(g.revs[l-i-1])
+		if !ok {
+			return l - i - 1
+		}
+	}
+	return -1
+}
+
+func (g *generation) String() string {
+	return fmt.Sprintf("g: created[%d] ver[%d], revs %#v\n", g.created, g.ver, g.revs)
+}
+
+func (a generation) equal(b generation) bool {
+	if a.ver != b.ver {
+		return false
+	}
+	if len(a.revs) != len(b.revs) {
+		return false
+	}
+
+	for i := range a.revs {
+		ar, br := a.revs[i], b.revs[i]
+		if ar != br {
+			return false
+		}
+	}
+	return true
+}
--- a/vendor/github.com/coreos/etcd/mvcc/kv.go
+++ b/vendor/github.com/coreos/etcd/mvcc/kv.go
@@ -0,0 +1,119 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"github.com/coreos/etcd/lease"
+	"github.com/coreos/etcd/mvcc/backend"
+	"github.com/coreos/etcd/mvcc/mvccpb"
+)
+
+type RangeOptions struct {
+	Limit int64
+	Rev   int64
+	Count bool
+}
+
+type RangeResult struct {
+	KVs   []mvccpb.KeyValue
+	Rev   int64
+	Count int
+}
+
+type KV interface {
+	// Rev returns the current revision of the KV.
+	Rev() int64
+
+	// FirstRev returns the first revision of the KV.
+	// After a compaction, the first revision increases to the compaction
+	// revision.
+	FirstRev() int64
+
+	// Range gets the keys in the range at rangeRev.
+	// The returned rev is the current revision of the KV when the operation is executed.
+	// If rangeRev <=0, range gets the keys at currentRev.
+	// If `end` is nil, the request returns the key.
+	// If `end` is not nil and not empty, it gets the keys in range [key, range_end).
+	// If `end` is not nil and empty, it gets the keys greater than or equal to key.
+	// Limit limits the number of keys returned.
+	// If the required rev is compacted, ErrCompacted will be returned.
+	Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error)
+
+	// Put puts the given key, value into the store. Put also takes additional argument lease to
+	// attach a lease to a key-value pair as meta-data. KV implementation does not validate the lease
+	// id.
+	// A put also increases the rev of the store, and generates one event in the event history.
+	// The returned rev is the current revision of the KV when the operation is executed.
+	Put(key, value []byte, lease lease.LeaseID) (rev int64)
+
+	// DeleteRange deletes the given range from the store.
+	// A deleteRange increases the rev of the store if any key in the range exists.
+	// The number of key deleted will be returned.
+	// The returned rev is the current revision of the KV when the operation is executed.
+	// It also generates one event for each key delete in the event history.
+	// if the `end` is nil, deleteRange deletes the key.
+	// if the `end` is not nil, deleteRange deletes the keys in range [key, range_end).
+	DeleteRange(key, end []byte) (n, rev int64)
+
+	// TxnBegin begins a txn. Only Txn prefixed operation can be executed, others will be blocked
+	// until txn ends. Only one on-going txn is allowed.
+	// TxnBegin returns an int64 txn ID.
+	// All txn prefixed operations with same txn ID will be done with the same rev.
+	TxnBegin() int64
+	// TxnEnd ends the on-going txn with txn ID. If the on-going txn ID is not matched, error is returned.
+	TxnEnd(txnID int64) error
+	// TxnRange returns the current revision of the KV when the operation is executed.
+	TxnRange(txnID int64, key, end []byte, ro RangeOptions) (r *RangeResult, err error)
+	TxnPut(txnID int64, key, value []byte, lease lease.LeaseID) (rev int64, err error)
+	TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error)
+
+	// Compact frees all superseded keys with revisions less than rev.
+	Compact(rev int64) (<-chan struct{}, error)
+
+	// Hash retrieves the hash of KV state and revision.
+	// This method is designed for consistency checking purpose.
+	Hash() (hash uint32, revision int64, err error)
+
+	// Commit commits txns into the underlying backend.
+	Commit()
+
+	// Restore restores the KV store from a backend.
+	Restore(b backend.Backend) error
+	Close() error
+}
+
+// WatchableKV is a KV that can be watched.
+type WatchableKV interface {
+	KV
+	Watchable
+}
+
+// Watchable is the interface that wraps the NewWatchStream function.
+type Watchable interface {
+	// NewWatchStream returns a WatchStream that can be used to
+	// watch events happened or happening on the KV.
+	NewWatchStream() WatchStream
+}
+
+// ConsistentWatchableKV is a WatchableKV that understands the consistency
+// algorithm and consistent index.
+// If the consistent index of executing entry is not larger than the
+// consistent index of ConsistentWatchableKV, all operations in
+// this entry are skipped and return empty response.
+type ConsistentWatchableKV interface {
+	WatchableKV
+	// ConsistentIndex returns the current consistent index of the KV.
+	ConsistentIndex() uint64
+}
--- a/vendor/github.com/coreos/etcd/mvcc/kvstore.go
+++ b/vendor/github.com/coreos/etcd/mvcc/kvstore.go
@@ -0,0 +1,681 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"encoding/binary"
+	"errors"
+	"math"
+	"math/rand"
+	"sync"
+	"time"
+
+	"github.com/coreos/etcd/lease"
+	"github.com/coreos/etcd/mvcc/backend"
+	"github.com/coreos/etcd/mvcc/mvccpb"
+	"github.com/coreos/etcd/pkg/schedule"
+	"github.com/coreos/pkg/capnslog"
+	"golang.org/x/net/context"
+)
+
+var (
+	keyBucketName  = []byte("key")
+	metaBucketName = []byte("meta")
+
+	// markedRevBytesLen is the byte length of marked revision.
+	// The first `revBytesLen` bytes represents a normal revision. The last
+	// one byte is the mark.
+	markedRevBytesLen      = revBytesLen + 1
+	markBytePosition       = markedRevBytesLen - 1
+	markTombstone     byte = 't'
+
+	consistentIndexKeyName  = []byte("consistent_index")
+	scheduledCompactKeyName = []byte("scheduledCompactRev")
+	finishedCompactKeyName  = []byte("finishedCompactRev")
+
+	ErrTxnIDMismatch = errors.New("mvcc: txn id mismatch")
+	ErrCompacted     = errors.New("mvcc: required revision has been compacted")
+	ErrFutureRev     = errors.New("mvcc: required revision is a future revision")
+	ErrCanceled      = errors.New("mvcc: watcher is canceled")
+
+	plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
+)
+
+// ConsistentIndexGetter is an interface that wraps the Get method.
+// Consistent index is the offset of an entry in a consistent replicated log.
+type ConsistentIndexGetter interface {
+	// ConsistentIndex returns the consistent index of current executing entry.
+	ConsistentIndex() uint64
+}
+
+type store struct {
+	mu sync.Mutex // guards the following
+
+	ig ConsistentIndexGetter
+
+	b       backend.Backend
+	kvindex index
+
+	le lease.Lessor
+
+	currentRev revision
+	// the main revision of the last compaction
+	compactMainRev int64
+
+	tx    backend.BatchTx
+	txnID int64 // tracks the current txnID to verify txn operations
+
+	// bytesBuf8 is a byte slice of length 8
+	// to avoid a repetitive allocation in saveIndex.
+	bytesBuf8 []byte
+
+	changes   []mvccpb.KeyValue
+	fifoSched schedule.Scheduler
+
+	stopc chan struct{}
+}
+
+// NewStore returns a new store. It is useful to create a store inside
+// mvcc pkg. It should only be used for testing externally.
+func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
+	s := &store{
+		b:       b,
+		ig:      ig,
+		kvindex: newTreeIndex(),
+
+		le: le,
+
+		currentRev:     revision{main: 1},
+		compactMainRev: -1,
+
+		bytesBuf8: make([]byte, 8, 8),
+		fifoSched: schedule.NewFIFOScheduler(),
+
+		stopc: make(chan struct{}),
+	}
+
+	if s.le != nil {
+		s.le.SetRangeDeleter(s)
+	}
+
+	tx := s.b.BatchTx()
+	tx.Lock()
+	tx.UnsafeCreateBucket(keyBucketName)
+	tx.UnsafeCreateBucket(metaBucketName)
+	tx.Unlock()
+	s.b.ForceCommit()
+
+	if err := s.restore(); err != nil {
+		// TODO: return the error instead of panic here?
+		panic("failed to recover store from backend")
+	}
+
+	return s
+}
+
+func (s *store) Rev() int64 {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	return s.currentRev.main
+}
+
+func (s *store) FirstRev() int64 {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	return s.compactMainRev
+}
+
+func (s *store) Put(key, value []byte, lease lease.LeaseID) int64 {
+	id := s.TxnBegin()
+	s.put(key, value, lease)
+	s.txnEnd(id)
+
+	putCounter.Inc()
+
+	return int64(s.currentRev.main)
+}
+
+func (s *store) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
+	id := s.TxnBegin()
+	kvs, count, rev, err := s.rangeKeys(key, end, ro.Limit, ro.Rev, ro.Count)
+	s.txnEnd(id)
+
+	rangeCounter.Inc()
+
+	r = &RangeResult{
+		KVs:   kvs,
+		Count: count,
+		Rev:   rev,
+	}
+
+	return r, err
+}
+
+func (s *store) DeleteRange(key, end []byte) (n, rev int64) {
+	id := s.TxnBegin()
+	n = s.deleteRange(key, end)
+	s.txnEnd(id)
+
+	deleteCounter.Inc()
+
+	return n, int64(s.currentRev.main)
+}
+
+func (s *store) TxnBegin() int64 {
+	s.mu.Lock()
+	s.currentRev.sub = 0
+	s.tx = s.b.BatchTx()
+	s.tx.Lock()
+	s.saveIndex()
+
+	s.txnID = rand.Int63()
+	return s.txnID
+}
+
+func (s *store) TxnEnd(txnID int64) error {
+	err := s.txnEnd(txnID)
+	if err != nil {
+		return err
+	}
+
+	txnCounter.Inc()
+	return nil
+}
+
+// txnEnd is used for unlocking an internal txn. It does
+// not increase the txnCounter.
+func (s *store) txnEnd(txnID int64) error {
+	if txnID != s.txnID {
+		return ErrTxnIDMismatch
+	}
+
+	s.tx.Unlock()
+	if s.currentRev.sub != 0 {
+		s.currentRev.main += 1
+	}
+	s.currentRev.sub = 0
+
+	dbTotalSize.Set(float64(s.b.Size()))
+	s.mu.Unlock()
+	return nil
+}
+
+func (s *store) TxnRange(txnID int64, key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
+	if txnID != s.txnID {
+		return nil, ErrTxnIDMismatch
+	}
+
+	kvs, count, rev, err := s.rangeKeys(key, end, ro.Limit, ro.Rev, ro.Count)
+
+	r = &RangeResult{
+		KVs:   kvs,
+		Count: count,
+		Rev:   rev,
+	}
+	return r, err
+}
+
+func (s *store) TxnPut(txnID int64, key, value []byte, lease lease.LeaseID) (rev int64, err error) {
+	if txnID != s.txnID {
+		return 0, ErrTxnIDMismatch
+	}
+
+	s.put(key, value, lease)
+	return int64(s.currentRev.main + 1), nil
+}
+
+func (s *store) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
+	if txnID != s.txnID {
+		return 0, 0, ErrTxnIDMismatch
+	}
+
+	n = s.deleteRange(key, end)
+	if n != 0 || s.currentRev.sub != 0 {
+		rev = int64(s.currentRev.main + 1)
+	} else {
+		rev = int64(s.currentRev.main)
+	}
+	return n, rev, nil
+}
+
+func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
+	if ctx == nil || ctx.Err() != nil {
+		s.mu.Lock()
+		select {
+		case <-s.stopc:
+		default:
+			f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
+			s.fifoSched.Schedule(f)
+		}
+		s.mu.Unlock()
+		return
+	}
+	close(ch)
+}
+
+func (s *store) Compact(rev int64) (<-chan struct{}, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if rev <= s.compactMainRev {
+		ch := make(chan struct{})
+		f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
+		s.fifoSched.Schedule(f)
+		return ch, ErrCompacted
+	}
+	if rev > s.currentRev.main {
+		return nil, ErrFutureRev
+	}
+
+	start := time.Now()
+
+	s.compactMainRev = rev
+
+	rbytes := newRevBytes()
+	revToBytes(revision{main: rev}, rbytes)
+
+	tx := s.b.BatchTx()
+	tx.Lock()
+	tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
+	tx.Unlock()
+	// ensure that desired compaction is persisted
+	s.b.ForceCommit()
+
+	keep := s.kvindex.Compact(rev)
+	ch := make(chan struct{})
+	var j = func(ctx context.Context) {
+		if ctx.Err() != nil {
+			s.compactBarrier(ctx, ch)
+			return
+		}
+		if !s.scheduleCompaction(rev, keep) {
+			s.compactBarrier(nil, ch)
+			return
+		}
+		close(ch)
+	}
+
+	s.fifoSched.Schedule(j)
+
+	indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))
+	return ch, nil
+}
+
+func (s *store) Hash() (uint32, int64, error) {
+	s.b.ForceCommit()
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// ignore hash consistent index field for now.
+	// consistent index might be changed due to v2 internal sync, which
+	// is not controllable by the user.
+	ignores := make(map[backend.IgnoreKey]struct{})
+	bk := backend.IgnoreKey{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}
+	ignores[bk] = struct{}{}
+
+	h, err := s.b.Hash(ignores)
+	rev := s.currentRev.main
+	return h, rev, err
+}
+
+func (s *store) Commit() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.tx = s.b.BatchTx()
+	s.tx.Lock()
+	s.saveIndex()
+	s.tx.Unlock()
+	s.b.ForceCommit()
+}
+
+func (s *store) Restore(b backend.Backend) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	close(s.stopc)
+	s.fifoSched.Stop()
+
+	s.b = b
+	s.kvindex = newTreeIndex()
+	s.currentRev = revision{main: 1}
+	s.compactMainRev = -1
+	s.tx = b.BatchTx()
+	s.txnID = -1
+	s.fifoSched = schedule.NewFIFOScheduler()
+	s.stopc = make(chan struct{})
+
+	return s.restore()
+}
+
+func (s *store) restore() error {
+	min, max := newRevBytes(), newRevBytes()
+	revToBytes(revision{main: 1}, min)
+	revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)
+
+	// restore index
+	tx := s.b.BatchTx()
+	tx.Lock()
+	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
+	if len(finishedCompactBytes) != 0 {
+		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
+		plog.Printf("restore compact to %d", s.compactMainRev)
+	}
+
+	// TODO: limit N to reduce max memory usage
+	keys, vals := tx.UnsafeRange(keyBucketName, min, max, 0)
+	for i, key := range keys {
+		var kv mvccpb.KeyValue
+		if err := kv.Unmarshal(vals[i]); err != nil {
+			plog.Fatalf("cannot unmarshal event: %v", err)
+		}
+
+		rev := bytesToRev(key[:revBytesLen])
+
+		// restore index
+		switch {
+		case isTombstone(key):
+			s.kvindex.Tombstone(kv.Key, rev)
+			if lease.LeaseID(kv.Lease) != lease.NoLease {
+				err := s.le.Detach(lease.LeaseID(kv.Lease), []lease.LeaseItem{{Key: string(kv.Key)}})
+				if err != nil && err != lease.ErrLeaseNotFound {
+					plog.Fatalf("unexpected Detach error %v", err)
+				}
+			}
+		default:
+			s.kvindex.Restore(kv.Key, revision{kv.CreateRevision, 0}, rev, kv.Version)
+			if lease.LeaseID(kv.Lease) != lease.NoLease {
+				if s.le == nil {
+					panic("no lessor to attach lease")
+				}
+				err := s.le.Attach(lease.LeaseID(kv.Lease), []lease.LeaseItem{{Key: string(kv.Key)}})
+				// We are walking through the kv history here. It is possible that we attached a key to
+				// the lease and the lease was revoked later.
+				// Thus attaching an old version of key to a none existing lease is possible here, and
+				// we should just ignore the error.
+				if err != nil && err != lease.ErrLeaseNotFound {
+					panic("unexpected Attach error")
+				}
+			}
+		}
+
+		// update revision
+		s.currentRev = rev
+	}
+
+	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
+	scheduledCompact := int64(0)
+	if len(scheduledCompactBytes) != 0 {
+		scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
+		if scheduledCompact <= s.compactMainRev {
+			scheduledCompact = 0
+		}
+	}
+
+	tx.Unlock()
+
+	if scheduledCompact != 0 {
+		s.Compact(scheduledCompact)
+		plog.Printf("resume scheduled compaction at %d", scheduledCompact)
+	}
+
+	return nil
+}
+
+func (s *store) Close() error {
+	close(s.stopc)
+	s.fifoSched.Stop()
+	return nil
+}
+
+func (a *store) Equal(b *store) bool {
+	if a.currentRev != b.currentRev {
+		return false
+	}
+	if a.compactMainRev != b.compactMainRev {
+		return false
+	}
+	return a.kvindex.Equal(b.kvindex)
+}
+
+// range is a keyword in Go, add Keys suffix.
+func (s *store) rangeKeys(key, end []byte, limit, rangeRev int64, countOnly bool) (kvs []mvccpb.KeyValue, count int, curRev int64, err error) {
+	curRev = int64(s.currentRev.main)
+	if s.currentRev.sub > 0 {
+		curRev += 1
+	}
+
+	if rangeRev > curRev {
+		return nil, -1, s.currentRev.main, ErrFutureRev
+	}
+	var rev int64
+	if rangeRev <= 0 {
+		rev = curRev
+	} else {
+		rev = rangeRev
+	}
+	if rev < s.compactMainRev {
+		return nil, -1, 0, ErrCompacted
+	}
+
+	_, revpairs := s.kvindex.Range(key, end, int64(rev))
+	if len(revpairs) == 0 {
+		return nil, 0, curRev, nil
+	}
+	if countOnly {
+		return nil, len(revpairs), curRev, nil
+	}
+
+	for _, revpair := range revpairs {
+		start, end := revBytesRange(revpair)
+
+		_, vs := s.tx.UnsafeRange(keyBucketName, start, end, 0)
+		if len(vs) != 1 {
+			plog.Fatalf("range cannot find rev (%d,%d)", revpair.main, revpair.sub)
+		}
+
+		var kv mvccpb.KeyValue
+		if err := kv.Unmarshal(vs[0]); err != nil {
+			plog.Fatalf("cannot unmarshal event: %v", err)
+		}
+		kvs = append(kvs, kv)
+		if limit > 0 && len(kvs) >= int(limit) {
+			break
+		}
+	}
+	return kvs, len(kvs), curRev, nil
+}
+
+func (s *store) put(key, value []byte, leaseID lease.LeaseID) {
+	rev := s.currentRev.main + 1
+	c := rev
+	oldLease := lease.NoLease
+
+	// if the key exists before, use its previous created and
+	// get its previous leaseID
+	grev, created, ver, err := s.kvindex.Get(key, rev)
+	if err == nil {
+		c = created.main
+		ibytes := newRevBytes()
+		revToBytes(grev, ibytes)
+		_, vs := s.tx.UnsafeRange(keyBucketName, ibytes, nil, 0)
+		var kv mvccpb.KeyValue
+		if err = kv.Unmarshal(vs[0]); err != nil {
+			plog.Fatalf("cannot unmarshal value: %v", err)
+		}
+		oldLease = lease.LeaseID(kv.Lease)
+	}
+
+	ibytes := newRevBytes()
+	revToBytes(revision{main: rev, sub: s.currentRev.sub}, ibytes)
+
+	ver = ver + 1
+	kv := mvccpb.KeyValue{
+		Key:            key,
+		Value:          value,
+		CreateRevision: c,
+		ModRevision:    rev,
+		Version:        ver,
+		Lease:          int64(leaseID),
+	}
+
+	d, err := kv.Marshal()
+	if err != nil {
+		plog.Fatalf("cannot marshal event: %v", err)
+	}
+
+	s.tx.UnsafeSeqPut(keyBucketName, ibytes, d)
+	s.kvindex.Put(key, revision{main: rev, sub: s.currentRev.sub})
+	s.changes = append(s.changes, kv)
+	s.currentRev.sub += 1
+
+	if oldLease != lease.NoLease {
+		if s.le == nil {
+			panic("no lessor to detach lease")
+		}
+
+		err = s.le.Detach(oldLease, []lease.LeaseItem{{Key: string(key)}})
+		if err != nil {
+			panic("unexpected error from lease detach")
+		}
+	}
+
+	if leaseID != lease.NoLease {
+		if s.le == nil {
+			panic("no lessor to attach lease")
+		}
+
+		err = s.le.Attach(leaseID, []lease.LeaseItem{{Key: string(key)}})
+		if err != nil {
+			panic("unexpected error from lease Attach")
+		}
+	}
+}
+
+func (s *store) deleteRange(key, end []byte) int64 {
+	rrev := s.currentRev.main
+	if s.currentRev.sub > 0 {
+		rrev += 1
+	}
+	keys, revs := s.kvindex.Range(key, end, rrev)
+
+	if len(keys) == 0 {
+		return 0
+	}
+
+	for i, key := range keys {
+		s.delete(key, revs[i])
+	}
+	return int64(len(keys))
+}
+
+func (s *store) delete(key []byte, rev revision) {
+	mainrev := s.currentRev.main + 1
+
+	ibytes := newRevBytes()
+	revToBytes(revision{main: mainrev, sub: s.currentRev.sub}, ibytes)
+	ibytes = appendMarkTombstone(ibytes)
+
+	kv := mvccpb.KeyValue{
+		Key: key,
+	}
+
+	d, err := kv.Marshal()
+	if err != nil {
+		plog.Fatalf("cannot marshal event: %v", err)
+	}
+
+	s.tx.UnsafeSeqPut(keyBucketName, ibytes, d)
+	err = s.kvindex.Tombstone(key, revision{main: mainrev, sub: s.currentRev.sub})
+	if err != nil {
+		plog.Fatalf("cannot tombstone an existing key (%s): %v", string(key), err)
+	}
+	s.changes = append(s.changes, kv)
+	s.currentRev.sub += 1
+
+	ibytes = newRevBytes()
+	revToBytes(rev, ibytes)
+	_, vs := s.tx.UnsafeRange(keyBucketName, ibytes, nil, 0)
+
+	kv.Reset()
+	if err = kv.Unmarshal(vs[0]); err != nil {
+		plog.Fatalf("cannot unmarshal value: %v", err)
+	}
+
+	if lease.LeaseID(kv.Lease) != lease.NoLease {
+		err = s.le.Detach(lease.LeaseID(kv.Lease), []lease.LeaseItem{{Key: string(kv.Key)}})
+		if err != nil {
+			plog.Fatalf("cannot detach %v", err)
+		}
+	}
+}
+
+func (s *store) getChanges() []mvccpb.KeyValue {
+	changes := s.changes
+	s.changes = make([]mvccpb.KeyValue, 0, 4)
+	return changes
+}
+
+func (s *store) saveIndex() {
+	if s.ig == nil {
+		return
+	}
+	tx := s.tx
+	bs := s.bytesBuf8
+	binary.BigEndian.PutUint64(bs, s.ig.ConsistentIndex())
+	// put the index into the underlying backend
+	// tx has been locked in TxnBegin, so there is no need to lock it again
+	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
+}
+
+func (s *store) ConsistentIndex() uint64 {
+	// TODO: cache index in a uint64 field?
+	tx := s.b.BatchTx()
+	tx.Lock()
+	defer tx.Unlock()
+	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
+	if len(vs) == 0 {
+		return 0
+	}
+	return binary.BigEndian.Uint64(vs[0])
+}
+
+// appendMarkTombstone appends tombstone mark to normal revision bytes.
+func appendMarkTombstone(b []byte) []byte {
+	if len(b) != revBytesLen {
+		plog.Panicf("cannot append mark to non normal revision bytes")
+	}
+	return append(b, markTombstone)
+}
+
+// isTombstone checks whether the revision bytes is a tombstone.
+func isTombstone(b []byte) bool {
+	return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
+}
+
+// revBytesRange returns the range of revision bytes at
+// the given revision.
+func revBytesRange(rev revision) (start, end []byte) {
+	start = newRevBytes()
+	revToBytes(rev, start)
+
+	end = newRevBytes()
+	endRev := revision{main: rev.main, sub: rev.sub + 1}
+	revToBytes(endRev, end)
+
+	return start, end
+}
--- a/vendor/github.com/coreos/etcd/mvcc/kvstore_compaction.go
+++ b/vendor/github.com/coreos/etcd/mvcc/kvstore_compaction.go
@@ -0,0 +1,66 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"encoding/binary"
+	"time"
+)
+
+func (s *store) scheduleCompaction(compactMainRev int64, keep map[revision]struct{}) bool {
+	totalStart := time.Now()
+	defer dbCompactionTotalDurations.Observe(float64(time.Since(totalStart) / time.Millisecond))
+
+	end := make([]byte, 8)
+	binary.BigEndian.PutUint64(end, uint64(compactMainRev+1))
+
+	batchsize := int64(10000)
+	last := make([]byte, 8+1+8)
+	for {
+		var rev revision
+
+		start := time.Now()
+		tx := s.b.BatchTx()
+		tx.Lock()
+
+		keys, _ := tx.UnsafeRange(keyBucketName, last, end, batchsize)
+		for _, key := range keys {
+			rev = bytesToRev(key)
+			if _, ok := keep[rev]; !ok {
+				tx.UnsafeDelete(keyBucketName, key)
+			}
+		}
+
+		if len(keys) < int(batchsize) {
+			rbytes := make([]byte, 8+1+8)
+			revToBytes(revision{main: compactMainRev}, rbytes)
+			tx.UnsafePut(metaBucketName, finishedCompactKeyName, rbytes)
+			tx.Unlock()
+			plog.Printf("finished scheduled compaction at %d (took %v)", compactMainRev, time.Since(totalStart))
+			return true
+		}
+
+		// update last
+		revToBytes(revision{main: rev.main, sub: rev.sub + 1}, last)
+		tx.Unlock()
+		dbCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))
+
+		select {
+		case <-time.After(100 * time.Millisecond):
+		case <-s.stopc:
+			return false
+		}
+	}
+}
--- a/vendor/github.com/coreos/etcd/mvcc/metrics.go
+++ b/vendor/github.com/coreos/etcd/mvcc/metrics.go
@@ -0,0 +1,163 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+var (
+	rangeCounter = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "range_total",
+			Help:      "Total number of ranges seen by this member.",
+		})
+
+	putCounter = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "put_total",
+			Help:      "Total number of puts seen by this member.",
+		})
+
+	deleteCounter = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "delete_total",
+			Help:      "Total number of deletes seen by this member.",
+		})
+
+	txnCounter = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "txn_total",
+			Help:      "Total number of txns seen by this member.",
+		})
+
+	keysGauge = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "keys_total",
+			Help:      "Total number of keys.",
+		})
+
+	watchStreamGauge = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "watch_stream_total",
+			Help:      "Total number of watch streams.",
+		})
+
+	watcherGauge = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "watcher_total",
+			Help:      "Total number of watchers.",
+		})
+
+	slowWatcherGauge = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "slow_watcher_total",
+			Help:      "Total number of unsynced slow watchers.",
+		})
+
+	totalEventsCounter = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "events_total",
+			Help:      "Total number of events sent by this member.",
+		})
+
+	pendingEventsGauge = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "pending_events_total",
+			Help:      "Total number of pending events to be sent.",
+		})
+
+	indexCompactionPauseDurations = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "index_compaction_pause_duration_milliseconds",
+			Help:      "Bucketed histogram of index compaction pause duration.",
+			// 0.5ms -> 1second
+			Buckets: prometheus.ExponentialBuckets(0.5, 2, 12),
+		})
+
+	dbCompactionPauseDurations = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "db_compaction_pause_duration_milliseconds",
+			Help:      "Bucketed histogram of db compaction pause duration.",
+			// 1ms -> 4second
+			Buckets: prometheus.ExponentialBuckets(1, 2, 13),
+		})
+
+	dbCompactionTotalDurations = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: "etcd_debugging",
+			Subsystem: "mvcc",
+			Name:      "db_compaction_total_duration_milliseconds",
+			Help:      "Bucketed histogram of db compaction total duration.",
+			// 100ms -> 800second
+			Buckets: prometheus.ExponentialBuckets(100, 2, 14),
+		})
+
+	dbTotalSize = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "etcd_debugging",
+		Subsystem: "mvcc",
+		Name:      "db_total_size_in_bytes",
+		Help:      "Total size of the underlying database in bytes.",
+	})
+)
+
+func init() {
+	prometheus.MustRegister(rangeCounter)
+	prometheus.MustRegister(putCounter)
+	prometheus.MustRegister(deleteCounter)
+	prometheus.MustRegister(txnCounter)
+	prometheus.MustRegister(keysGauge)
+	prometheus.MustRegister(watchStreamGauge)
+	prometheus.MustRegister(watcherGauge)
+	prometheus.MustRegister(slowWatcherGauge)
+	prometheus.MustRegister(totalEventsCounter)
+	prometheus.MustRegister(pendingEventsGauge)
+	prometheus.MustRegister(indexCompactionPauseDurations)
+	prometheus.MustRegister(dbCompactionPauseDurations)
+	prometheus.MustRegister(dbCompactionTotalDurations)
+	prometheus.MustRegister(dbTotalSize)
+}
+
+// ReportEventReceived reports that an event is received.
+// This function should be called when the external systems received an
+// event from mvcc.Watcher.
+func ReportEventReceived(n int) {
+	pendingEventsGauge.Sub(float64(n))
+	totalEventsCounter.Add(float64(n))
+}
--- a/vendor/github.com/coreos/etcd/mvcc/mvccpb/kv.pb.go
+++ b/vendor/github.com/coreos/etcd/mvcc/mvccpb/kv.pb.go
@@ -0,0 +1,681 @@
+// Code generated by protoc-gen-gogo.
+// source: kv.proto
+// DO NOT EDIT!
+
+/*
+	Package mvccpb is a generated protocol buffer package.
+
+	It is generated from these files:
+		kv.proto
+
+	It has these top-level messages:
+		KeyValue
+		Event
+*/
+package mvccpb
+
+import (
+	"fmt"
+
+	proto "github.com/golang/protobuf/proto"
+
+	math "math"
+)
+
+import io "io"
+
+// Reference imports to suppress errors if they are not otherwise used.
+var _ = proto.Marshal
+var _ = fmt.Errorf
+var _ = math.Inf
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the proto package it is being compiled against.
+const _ = proto.ProtoPackageIsVersion1
+
+type Event_EventType int32
+
+const (
+	PUT    Event_EventType = 0
+	DELETE Event_EventType = 1
+)
+
+var Event_EventType_name = map[int32]string{
+	0: "PUT",
+	1: "DELETE",
+}
+var Event_EventType_value = map[string]int32{
+	"PUT":    0,
+	"DELETE": 1,
+}
+
+func (x Event_EventType) String() string {
+	return proto.EnumName(Event_EventType_name, int32(x))
+}
+func (Event_EventType) EnumDescriptor() ([]byte, []int) { return fileDescriptorKv, []int{1, 0} }
+
+type KeyValue struct {
+	// key is the key in bytes. An empty key is not allowed.
+	Key []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"`
+	// create_revision is the revision of last creation on this key.
+	CreateRevision int64 `protobuf:"varint,2,opt,name=create_revision,json=createRevision,proto3" json:"create_revision,omitempty"`
+	// mod_revision is the revision of last modification on this key.
+	ModRevision int64 `protobuf:"varint,3,opt,name=mod_revision,json=modRevision,proto3" json:"mod_revision,omitempty"`
+	// version is the version of the key. A deletion resets
+	// the version to zero and any modification of the key
+	// increases its version.
+	Version int64 `protobuf:"varint,4,opt,name=version,proto3" json:"version,omitempty"`
+	// value is the value held by the key, in bytes.
+	Value []byte `protobuf:"bytes,5,opt,name=value,proto3" json:"value,omitempty"`
+	// lease is the ID of the lease that attached to key.
+	// When the attached lease expires, the key will be deleted.
+	// If lease is 0, then no lease is attached to the key.
+	Lease int64 `protobuf:"varint,6,opt,name=lease,proto3" json:"lease,omitempty"`
+}
+
+func (m *KeyValue) Reset()                    { *m = KeyValue{} }
+func (m *KeyValue) String() string            { return proto.CompactTextString(m) }
+func (*KeyValue) ProtoMessage()               {}
+func (*KeyValue) Descriptor() ([]byte, []int) { return fileDescriptorKv, []int{0} }
+
+type Event struct {
+	// type is the kind of event. If type is a PUT, it indicates
+	// new data has been stored to the key. If type is a DELETE,
+	// it indicates the key was deleted.
+	Type Event_EventType `protobuf:"varint,1,opt,name=type,proto3,enum=mvccpb.Event_EventType" json:"type,omitempty"`
+	// kv holds the KeyValue for the event.
+	// A PUT event contains current kv pair.
+	// A PUT event with kv.Version=1 indicates the creation of a key.
+	// A DELETE/EXPIRE event contains the deleted key with
+	// its modification revision set to the revision of deletion.
+	Kv *KeyValue `protobuf:"bytes,2,opt,name=kv" json:"kv,omitempty"`
+}
+
+func (m *Event) Reset()                    { *m = Event{} }
+func (m *Event) String() string            { return proto.CompactTextString(m) }
+func (*Event) ProtoMessage()               {}
+func (*Event) Descriptor() ([]byte, []int) { return fileDescriptorKv, []int{1} }
+
+func init() {
+	proto.RegisterType((*KeyValue)(nil), "mvccpb.KeyValue")
+	proto.RegisterType((*Event)(nil), "mvccpb.Event")
+	proto.RegisterEnum("mvccpb.Event_EventType", Event_EventType_name, Event_EventType_value)
+}
+func (m *KeyValue) Marshal() (data []byte, err error) {
+	size := m.Size()
+	data = make([]byte, size)
+	n, err := m.MarshalTo(data)
+	if err != nil {
+		return nil, err
+	}
+	return data[:n], nil
+}
+
+func (m *KeyValue) MarshalTo(data []byte) (int, error) {
+	var i int
+	_ = i
+	var l int
+	_ = l
+	if len(m.Key) > 0 {
+		data[i] = 0xa
+		i++
+		i = encodeVarintKv(data, i, uint64(len(m.Key)))
+		i += copy(data[i:], m.Key)
+	}
+	if m.CreateRevision != 0 {
+		data[i] = 0x10
+		i++
+		i = encodeVarintKv(data, i, uint64(m.CreateRevision))
+	}
+	if m.ModRevision != 0 {
+		data[i] = 0x18
+		i++
+		i = encodeVarintKv(data, i, uint64(m.ModRevision))
+	}
+	if m.Version != 0 {
+		data[i] = 0x20
+		i++
+		i = encodeVarintKv(data, i, uint64(m.Version))
+	}
+	if len(m.Value) > 0 {
+		data[i] = 0x2a
+		i++
+		i = encodeVarintKv(data, i, uint64(len(m.Value)))
+		i += copy(data[i:], m.Value)
+	}
+	if m.Lease != 0 {
+		data[i] = 0x30
+		i++
+		i = encodeVarintKv(data, i, uint64(m.Lease))
+	}
+	return i, nil
+}
+
+func (m *Event) Marshal() (data []byte, err error) {
+	size := m.Size()
+	data = make([]byte, size)
+	n, err := m.MarshalTo(data)
+	if err != nil {
+		return nil, err
+	}
+	return data[:n], nil
+}
+
+func (m *Event) MarshalTo(data []byte) (int, error) {
+	var i int
+	_ = i
+	var l int
+	_ = l
+	if m.Type != 0 {
+		data[i] = 0x8
+		i++
+		i = encodeVarintKv(data, i, uint64(m.Type))
+	}
+	if m.Kv != nil {
+		data[i] = 0x12
+		i++
+		i = encodeVarintKv(data, i, uint64(m.Kv.Size()))
+		n1, err := m.Kv.MarshalTo(data[i:])
+		if err != nil {
+			return 0, err
+		}
+		i += n1
+	}
+	return i, nil
+}
+
+func encodeFixed64Kv(data []byte, offset int, v uint64) int {
+	data[offset] = uint8(v)
+	data[offset+1] = uint8(v >> 8)
+	data[offset+2] = uint8(v >> 16)
+	data[offset+3] = uint8(v >> 24)
+	data[offset+4] = uint8(v >> 32)
+	data[offset+5] = uint8(v >> 40)
+	data[offset+6] = uint8(v >> 48)
+	data[offset+7] = uint8(v >> 56)
+	return offset + 8
+}
+func encodeFixed32Kv(data []byte, offset int, v uint32) int {
+	data[offset] = uint8(v)
+	data[offset+1] = uint8(v >> 8)
+	data[offset+2] = uint8(v >> 16)
+	data[offset+3] = uint8(v >> 24)
+	return offset + 4
+}
+func encodeVarintKv(data []byte, offset int, v uint64) int {
+	for v >= 1<<7 {
+		data[offset] = uint8(v&0x7f | 0x80)
+		v >>= 7
+		offset++
+	}
+	data[offset] = uint8(v)
+	return offset + 1
+}
+func (m *KeyValue) Size() (n int) {
+	var l int
+	_ = l
+	l = len(m.Key)
+	if l > 0 {
+		n += 1 + l + sovKv(uint64(l))
+	}
+	if m.CreateRevision != 0 {
+		n += 1 + sovKv(uint64(m.CreateRevision))
+	}
+	if m.ModRevision != 0 {
+		n += 1 + sovKv(uint64(m.ModRevision))
+	}
+	if m.Version != 0 {
+		n += 1 + sovKv(uint64(m.Version))
+	}
+	l = len(m.Value)
+	if l > 0 {
+		n += 1 + l + sovKv(uint64(l))
+	}
+	if m.Lease != 0 {
+		n += 1 + sovKv(uint64(m.Lease))
+	}
+	return n
+}
+
+func (m *Event) Size() (n int) {
+	var l int
+	_ = l
+	if m.Type != 0 {
+		n += 1 + sovKv(uint64(m.Type))
+	}
+	if m.Kv != nil {
+		l = m.Kv.Size()
+		n += 1 + l + sovKv(uint64(l))
+	}
+	return n
+}
+
+func sovKv(x uint64) (n int) {
+	for {
+		n++
+		x >>= 7
+		if x == 0 {
+			break
+		}
+	}
+	return n
+}
+func sozKv(x uint64) (n int) {
+	return sovKv(uint64((x << 1) ^ uint64((int64(x) >> 63))))
+}
+func (m *KeyValue) Unmarshal(data []byte) error {
+	l := len(data)
+	iNdEx := 0
+	for iNdEx < l {
+		preIndex := iNdEx
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if shift >= 64 {
+				return ErrIntOverflowKv
+			}
+			if iNdEx >= l {
+				return io.ErrUnexpectedEOF
+			}
+			b := data[iNdEx]
+			iNdEx++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		fieldNum := int32(wire >> 3)
+		wireType := int(wire & 0x7)
+		if wireType == 4 {
+			return fmt.Errorf("proto: KeyValue: wiretype end group for non-group")
+		}
+		if fieldNum <= 0 {
+			return fmt.Errorf("proto: KeyValue: illegal tag %d (wire type %d)", fieldNum, wire)
+		}
+		switch fieldNum {
+		case 1:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType)
+			}
+			var byteLen int
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				byteLen |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			if byteLen < 0 {
+				return ErrInvalidLengthKv
+			}
+			postIndex := iNdEx + byteLen
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.Key = append(m.Key[:0], data[iNdEx:postIndex]...)
+			if m.Key == nil {
+				m.Key = []byte{}
+			}
+			iNdEx = postIndex
+		case 2:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field CreateRevision", wireType)
+			}
+			m.CreateRevision = 0
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				m.CreateRevision |= (int64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		case 3:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field ModRevision", wireType)
+			}
+			m.ModRevision = 0
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				m.ModRevision |= (int64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		case 4:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Version", wireType)
+			}
+			m.Version = 0
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				m.Version |= (int64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		case 5:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
+			}
+			var byteLen int
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				byteLen |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			if byteLen < 0 {
+				return ErrInvalidLengthKv
+			}
+			postIndex := iNdEx + byteLen
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.Value = append(m.Value[:0], data[iNdEx:postIndex]...)
+			if m.Value == nil {
+				m.Value = []byte{}
+			}
+			iNdEx = postIndex
+		case 6:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Lease", wireType)
+			}
+			m.Lease = 0
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				m.Lease |= (int64(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		default:
+			iNdEx = preIndex
+			skippy, err := skipKv(data[iNdEx:])
+			if err != nil {
+				return err
+			}
+			if skippy < 0 {
+				return ErrInvalidLengthKv
+			}
+			if (iNdEx + skippy) > l {
+				return io.ErrUnexpectedEOF
+			}
+			iNdEx += skippy
+		}
+	}
+
+	if iNdEx > l {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
+func (m *Event) Unmarshal(data []byte) error {
+	l := len(data)
+	iNdEx := 0
+	for iNdEx < l {
+		preIndex := iNdEx
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if shift >= 64 {
+				return ErrIntOverflowKv
+			}
+			if iNdEx >= l {
+				return io.ErrUnexpectedEOF
+			}
+			b := data[iNdEx]
+			iNdEx++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		fieldNum := int32(wire >> 3)
+		wireType := int(wire & 0x7)
+		if wireType == 4 {
+			return fmt.Errorf("proto: Event: wiretype end group for non-group")
+		}
+		if fieldNum <= 0 {
+			return fmt.Errorf("proto: Event: illegal tag %d (wire type %d)", fieldNum, wire)
+		}
+		switch fieldNum {
+		case 1:
+			if wireType != 0 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Type", wireType)
+			}
+			m.Type = 0
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				m.Type |= (Event_EventType(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+		case 2:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Kv", wireType)
+			}
+			var msglen int
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				msglen |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			if msglen < 0 {
+				return ErrInvalidLengthKv
+			}
+			postIndex := iNdEx + msglen
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			if m.Kv == nil {
+				m.Kv = &KeyValue{}
+			}
+			if err := m.Kv.Unmarshal(data[iNdEx:postIndex]); err != nil {
+				return err
+			}
+			iNdEx = postIndex
+		default:
+			iNdEx = preIndex
+			skippy, err := skipKv(data[iNdEx:])
+			if err != nil {
+				return err
+			}
+			if skippy < 0 {
+				return ErrInvalidLengthKv
+			}
+			if (iNdEx + skippy) > l {
+				return io.ErrUnexpectedEOF
+			}
+			iNdEx += skippy
+		}
+	}
+
+	if iNdEx > l {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
+func skipKv(data []byte) (n int, err error) {
+	l := len(data)
+	iNdEx := 0
+	for iNdEx < l {
+		var wire uint64
+		for shift := uint(0); ; shift += 7 {
+			if shift >= 64 {
+				return 0, ErrIntOverflowKv
+			}
+			if iNdEx >= l {
+				return 0, io.ErrUnexpectedEOF
+			}
+			b := data[iNdEx]
+			iNdEx++
+			wire |= (uint64(b) & 0x7F) << shift
+			if b < 0x80 {
+				break
+			}
+		}
+		wireType := int(wire & 0x7)
+		switch wireType {
+		case 0:
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return 0, ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return 0, io.ErrUnexpectedEOF
+				}
+				iNdEx++
+				if data[iNdEx-1] < 0x80 {
+					break
+				}
+			}
+			return iNdEx, nil
+		case 1:
+			iNdEx += 8
+			return iNdEx, nil
+		case 2:
+			var length int
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return 0, ErrIntOverflowKv
+				}
+				if iNdEx >= l {
+					return 0, io.ErrUnexpectedEOF
+				}
+				b := data[iNdEx]
+				iNdEx++
+				length |= (int(b) & 0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			iNdEx += length
+			if length < 0 {
+				return 0, ErrInvalidLengthKv
+			}
+			return iNdEx, nil
+		case 3:
+			for {
+				var innerWire uint64
+				var start int = iNdEx
+				for shift := uint(0); ; shift += 7 {
+					if shift >= 64 {
+						return 0, ErrIntOverflowKv
+					}
+					if iNdEx >= l {
+						return 0, io.ErrUnexpectedEOF
+					}
+					b := data[iNdEx]
+					iNdEx++
+					innerWire |= (uint64(b) & 0x7F) << shift
+					if b < 0x80 {
+						break
+					}
+				}
+				innerWireType := int(innerWire & 0x7)
+				if innerWireType == 4 {
+					break
+				}
+				next, err := skipKv(data[start:])
+				if err != nil {
+					return 0, err
+				}
+				iNdEx = start + next
+			}
+			return iNdEx, nil
+		case 4:
+			return iNdEx, nil
+		case 5:
+			iNdEx += 4
+			return iNdEx, nil
+		default:
+			return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
+		}
+	}
+	panic("unreachable")
+}
+
+var (
+	ErrInvalidLengthKv = fmt.Errorf("proto: negative length found during unmarshaling")
+	ErrIntOverflowKv   = fmt.Errorf("proto: integer overflow")
+)
+
+var fileDescriptorKv = []byte{
+	// 285 bytes of a gzipped FileDescriptorProto
+	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0x44, 0x90, 0x41, 0x4a, 0xc3, 0x40,
+	0x14, 0x86, 0x33, 0x4d, 0x9b, 0xd6, 0xd7, 0x52, 0xc3, 0x50, 0x70, 0x70, 0x31, 0xc4, 0x6e, 0x2c,
+	0x08, 0x11, 0xea, 0x0d, 0xc4, 0xac, 0x74, 0x21, 0x21, 0xba, 0x95, 0x34, 0x7d, 0x94, 0x92, 0xa6,
+	0x13, 0xd2, 0x38, 0x98, 0x9b, 0x78, 0x0a, 0xcf, 0xd1, 0x65, 0x8f, 0x60, 0xe3, 0x45, 0x24, 0x6f,
+	0x4c, 0xdd, 0x0c, 0xef, 0xff, 0xff, 0x6f, 0x98, 0xff, 0x0d, 0x0c, 0x52, 0xed, 0xe7, 0x85, 0x2a,
+	0x15, 0x77, 0x32, 0x9d, 0x24, 0xf9, 0xe2, 0x72, 0xb2, 0x52, 0x2b, 0x45, 0xd6, 0x6d, 0x33, 0x99,
+	0x74, 0xfa, 0xc5, 0x60, 0xf0, 0x88, 0xd5, 0x6b, 0xbc, 0x79, 0x47, 0xee, 0x82, 0x9d, 0x62, 0x25,
+	0x98, 0xc7, 0x66, 0xa3, 0xb0, 0x19, 0xf9, 0x35, 0x9c, 0x27, 0x05, 0xc6, 0x25, 0xbe, 0x15, 0xa8,
+	0xd7, 0xbb, 0xb5, 0xda, 0x8a, 0x8e, 0xc7, 0x66, 0x76, 0x38, 0x36, 0x76, 0xf8, 0xe7, 0xf2, 0x2b,
+	0x18, 0x65, 0x6a, 0xf9, 0x4f, 0xd9, 0x44, 0x0d, 0x33, 0xb5, 0x3c, 0x21, 0x02, 0xfa, 0x1a, 0x0b,
+	0x4a, 0xbb, 0x94, 0xb6, 0x92, 0x4f, 0xa0, 0xa7, 0x9b, 0x02, 0xa2, 0x47, 0x2f, 0x1b, 0xd1, 0xb8,
+	0x1b, 0x8c, 0x77, 0x28, 0x1c, 0xa2, 0x8d, 0x98, 0x7e, 0x40, 0x2f, 0xd0, 0xb8, 0x2d, 0xf9, 0x0d,
+	0x74, 0xcb, 0x2a, 0x47, 0x6a, 0x3b, 0x9e, 0x5f, 0xf8, 0x66, 0x4d, 0x9f, 0x42, 0x73, 0x46, 0x55,
+	0x8e, 0x21, 0x41, 0xdc, 0x83, 0x4e, 0xaa, 0xa9, 0xfa, 0x70, 0xee, 0xb6, 0x68, 0xbb, 0x77, 0xd8,
+	0x49, 0xf5, 0xd4, 0x83, 0xb3, 0xd3, 0x25, 0xde, 0x07, 0xfb, 0xf9, 0x25, 0x72, 0x2d, 0x0e, 0xe0,
+	0x3c, 0x04, 0x4f, 0x41, 0x14, 0xb8, 0xec, 0x5e, 0xec, 0x8f, 0xd2, 0x3a, 0x1c, 0xa5, 0xb5, 0xaf,
+	0x25, 0x3b, 0xd4, 0x92, 0x7d, 0xd7, 0x92, 0x7d, 0xfe, 0x48, 0x6b, 0xe1, 0xd0, 0x5f, 0xde, 0xfd,
+	0x06, 0x00, 0x00, 0xff, 0xff, 0xd6, 0x21, 0x8f, 0x2c, 0x75, 0x01, 0x00, 0x00,
+}
--- a/vendor/github.com/coreos/etcd/mvcc/mvccpb/kv.proto
+++ b/vendor/github.com/coreos/etcd/mvcc/mvccpb/kv.proto
@@ -0,0 +1,46 @@
+syntax = "proto3";
+package mvccpb;
+
+import "gogoproto/gogo.proto";
+
+option (gogoproto.marshaler_all) = true;
+option (gogoproto.sizer_all) = true;
+option (gogoproto.unmarshaler_all) = true;
+option (gogoproto.goproto_getters_all) = false;
+option (gogoproto.goproto_enum_prefix_all) = false;
+
+message KeyValue {
+  // key is the key in bytes. An empty key is not allowed.
+  bytes key = 1;
+  // create_revision is the revision of last creation on this key.
+  int64 create_revision = 2;
+  // mod_revision is the revision of last modification on this key.
+  int64 mod_revision = 3;
+  // version is the version of the key. A deletion resets
+  // the version to zero and any modification of the key
+  // increases its version.
+  int64 version = 4;
+  // value is the value held by the key, in bytes.
+  bytes value = 5;
+  // lease is the ID of the lease that attached to key.
+  // When the attached lease expires, the key will be deleted.
+  // If lease is 0, then no lease is attached to the key.
+  int64 lease = 6;
+}
+
+message Event {
+  enum EventType {
+    PUT = 0;
+    DELETE = 1;
+  }
+  // type is the kind of event. If type is a PUT, it indicates
+  // new data has been stored to the key. If type is a DELETE,
+  // it indicates the key was deleted.
+  EventType type = 1;
+  // kv holds the KeyValue for the event.
+  // A PUT event contains current kv pair.
+  // A PUT event with kv.Version=1 indicates the creation of a key.
+  // A DELETE/EXPIRE event contains the deleted key with
+  // its modification revision set to the revision of deletion.
+  KeyValue kv = 2;
+}
--- a/vendor/github.com/coreos/etcd/mvcc/revision.go
+++ b/vendor/github.com/coreos/etcd/mvcc/revision.go
@@ -0,0 +1,67 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import "encoding/binary"
+
+// revBytesLen is the byte length of a normal revision.
+// First 8 bytes is the revision.main in big-endian format. The 9th byte
+// is a '_'. The last 8 bytes is the revision.sub in big-endian format.
+const revBytesLen = 8 + 1 + 8
+
+// A revision indicates modification of the key-value space.
+// The set of changes that share same main revision changes the key-value space atomically.
+type revision struct {
+	// main is the main revision of a set of changes that happen atomically.
+	main int64
+
+	// sub is the the sub revision of a change in a set of changes that happen
+	// atomically. Each change has different increasing sub revision in that
+	// set.
+	sub int64
+}
+
+func (a revision) GreaterThan(b revision) bool {
+	if a.main > b.main {
+		return true
+	}
+	if a.main < b.main {
+		return false
+	}
+	return a.sub > b.sub
+}
+
+func newRevBytes() []byte {
+	return make([]byte, revBytesLen, markedRevBytesLen)
+}
+
+func revToBytes(rev revision, bytes []byte) {
+	binary.BigEndian.PutUint64(bytes, uint64(rev.main))
+	bytes[8] = '_'
+	binary.BigEndian.PutUint64(bytes[9:], uint64(rev.sub))
+}
+
+func bytesToRev(bytes []byte) revision {
+	return revision{
+		main: int64(binary.BigEndian.Uint64(bytes[0:8])),
+		sub:  int64(binary.BigEndian.Uint64(bytes[9:])),
+	}
+}
+
+type revisions []revision
+
+func (a revisions) Len() int           { return len(a) }
+func (a revisions) Less(i, j int) bool { return a[j].GreaterThan(a[i]) }
+func (a revisions) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
--- a/vendor/github.com/coreos/etcd/mvcc/util.go
+++ b/vendor/github.com/coreos/etcd/mvcc/util.go
@@ -0,0 +1,56 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"encoding/binary"
+
+	"github.com/coreos/etcd/mvcc/backend"
+	"github.com/coreos/etcd/mvcc/mvccpb"
+)
+
+func UpdateConsistentIndex(be backend.Backend, index uint64) {
+	tx := be.BatchTx()
+	tx.Lock()
+	defer tx.Unlock()
+
+	var oldi uint64
+	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
+	if len(vs) != 0 {
+		oldi = binary.BigEndian.Uint64(vs[0])
+	}
+
+	if index <= oldi {
+		return
+	}
+
+	bs := make([]byte, 8)
+	binary.BigEndian.PutUint64(bs, index)
+	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
+}
+
+func WriteKV(be backend.Backend, kv mvccpb.KeyValue) {
+	ibytes := newRevBytes()
+	revToBytes(revision{main: kv.ModRevision}, ibytes)
+
+	d, err := kv.Marshal()
+	if err != nil {
+		plog.Fatalf("cannot marshal event: %v", err)
+	}
+
+	be.BatchTx().Lock()
+	be.BatchTx().UnsafePut(keyBucketName, ibytes, d)
+	be.BatchTx().Unlock()
+}
--- a/vendor/github.com/coreos/etcd/mvcc/watchable_store.go
+++ b/vendor/github.com/coreos/etcd/mvcc/watchable_store.go
@@ -0,0 +1,548 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"sync"
+	"time"
+
+	"github.com/coreos/etcd/lease"
+	"github.com/coreos/etcd/mvcc/backend"
+	"github.com/coreos/etcd/mvcc/mvccpb"
+)
+
+const (
+	// chanBufLen is the length of the buffered chan
+	// for sending out watched events.
+	// TODO: find a good buf value. 1024 is just a random one that
+	// seems to be reasonable.
+	chanBufLen = 1024
+
+	// maxWatchersPerSync is the number of watchers to sync in a single batch
+	maxWatchersPerSync = 512
+)
+
+type watchable interface {
+	watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc)
+	progress(w *watcher)
+	rev() int64
+}
+
+type watchableStore struct {
+	mu sync.Mutex
+
+	*store
+
+	// victims are watcher batches that were blocked on the watch channel
+	victims []watcherBatch
+	victimc chan struct{}
+
+	// contains all unsynced watchers that needs to sync with events that have happened
+	unsynced watcherGroup
+
+	// contains all synced watchers that are in sync with the progress of the store.
+	// The key of the map is the key that the watcher watches on.
+	synced watcherGroup
+
+	stopc chan struct{}
+	wg    sync.WaitGroup
+}
+
+// cancelFunc updates unsynced and synced maps when running
+// cancel operations.
+type cancelFunc func()
+
+func New(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) ConsistentWatchableKV {
+	return newWatchableStore(b, le, ig)
+}
+
+func newWatchableStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *watchableStore {
+	s := &watchableStore{
+		store:    NewStore(b, le, ig),
+		victimc:  make(chan struct{}, 1),
+		unsynced: newWatcherGroup(),
+		synced:   newWatcherGroup(),
+		stopc:    make(chan struct{}),
+	}
+	if s.le != nil {
+		// use this store as the deleter so revokes trigger watch events
+		s.le.SetRangeDeleter(s)
+	}
+	s.wg.Add(2)
+	go s.syncWatchersLoop()
+	go s.syncVictimsLoop()
+	return s
+}
+
+func (s *watchableStore) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	rev = s.store.Put(key, value, lease)
+	changes := s.store.getChanges()
+	if len(changes) != 1 {
+		plog.Panicf("unexpected len(changes) != 1 after put")
+	}
+
+	ev := mvccpb.Event{
+		Type: mvccpb.PUT,
+		Kv:   &changes[0],
+	}
+	s.notify(rev, []mvccpb.Event{ev})
+	return rev
+}
+
+func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	n, rev = s.store.DeleteRange(key, end)
+	changes := s.store.getChanges()
+
+	if len(changes) != int(n) {
+		plog.Panicf("unexpected len(changes) != n after deleteRange")
+	}
+
+	if n == 0 {
+		return n, rev
+	}
+
+	evs := make([]mvccpb.Event, n)
+	for i := range changes {
+		evs[i] = mvccpb.Event{
+			Type: mvccpb.DELETE,
+			Kv:   &changes[i]}
+		evs[i].Kv.ModRevision = rev
+	}
+	s.notify(rev, evs)
+	return n, rev
+}
+
+func (s *watchableStore) TxnBegin() int64 {
+	s.mu.Lock()
+	return s.store.TxnBegin()
+}
+
+func (s *watchableStore) TxnEnd(txnID int64) error {
+	err := s.store.TxnEnd(txnID)
+	if err != nil {
+		return err
+	}
+
+	changes := s.getChanges()
+	if len(changes) == 0 {
+		s.mu.Unlock()
+		return nil
+	}
+
+	rev := s.store.Rev()
+	evs := make([]mvccpb.Event, len(changes))
+	for i, change := range changes {
+		switch change.CreateRevision {
+		case 0:
+			evs[i] = mvccpb.Event{
+				Type: mvccpb.DELETE,
+				Kv:   &changes[i]}
+			evs[i].Kv.ModRevision = rev
+		default:
+			evs[i] = mvccpb.Event{
+				Type: mvccpb.PUT,
+				Kv:   &changes[i]}
+		}
+	}
+
+	s.notify(rev, evs)
+	s.mu.Unlock()
+
+	return nil
+}
+
+func (s *watchableStore) Close() error {
+	close(s.stopc)
+	s.wg.Wait()
+	return s.store.Close()
+}
+
+func (s *watchableStore) NewWatchStream() WatchStream {
+	watchStreamGauge.Inc()
+	return &watchStream{
+		watchable: s,
+		ch:        make(chan WatchResponse, chanBufLen),
+		cancels:   make(map[WatchID]cancelFunc),
+		watchers:  make(map[WatchID]*watcher),
+	}
+}
+
+func (s *watchableStore) watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse) (*watcher, cancelFunc) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	wa := &watcher{
+		key:    key,
+		end:    end,
+		minRev: startRev,
+		id:     id,
+		ch:     ch,
+	}
+
+	s.store.mu.Lock()
+	synced := startRev > s.store.currentRev.main || startRev == 0
+	if synced {
+		wa.minRev = s.store.currentRev.main + 1
+		if startRev > wa.minRev {
+			wa.minRev = startRev
+		}
+	}
+	s.store.mu.Unlock()
+	if synced {
+		s.synced.add(wa)
+	} else {
+		slowWatcherGauge.Inc()
+		s.unsynced.add(wa)
+	}
+	watcherGauge.Inc()
+
+	return wa, func() { s.cancelWatcher(wa) }
+}
+
+// cancelWatcher removes references of the watcher from the watchableStore
+func (s *watchableStore) cancelWatcher(wa *watcher) {
+	for {
+		s.mu.Lock()
+
+		if s.unsynced.delete(wa) {
+			slowWatcherGauge.Dec()
+			break
+		} else if s.synced.delete(wa) {
+			break
+		} else if wa.compacted {
+			break
+		}
+
+		if !wa.victim {
+			panic("watcher not victim but not in watch groups")
+		}
+
+		var victimBatch watcherBatch
+		for _, wb := range s.victims {
+			if wb[wa] != nil {
+				victimBatch = wb
+				break
+			}
+		}
+		if victimBatch != nil {
+			slowWatcherGauge.Dec()
+			delete(victimBatch, wa)
+			break
+		}
+
+		// victim being processed so not accessible; retry
+		s.mu.Unlock()
+		time.Sleep(time.Millisecond)
+	}
+
+	watcherGauge.Dec()
+	s.mu.Unlock()
+}
+
+// syncWatchersLoop syncs the watcher in the unsynced map every 100ms.
+func (s *watchableStore) syncWatchersLoop() {
+	defer s.wg.Done()
+
+	for {
+		s.mu.Lock()
+		st := time.Now()
+		lastUnsyncedWatchers := s.unsynced.size()
+		s.syncWatchers()
+		unsyncedWatchers := s.unsynced.size()
+		s.mu.Unlock()
+		syncDuration := time.Since(st)
+
+		waitDuration := 100 * time.Millisecond
+		// more work pending?
+		if unsyncedWatchers != 0 && lastUnsyncedWatchers > unsyncedWatchers {
+			// be fair to other store operations by yielding time taken
+			waitDuration = syncDuration
+		}
+
+		select {
+		case <-time.After(waitDuration):
+		case <-s.stopc:
+			return
+		}
+	}
+}
+
+// syncVictimsLoop tries to write precomputed watcher responses to
+// watchers that had a blocked watcher channel
+func (s *watchableStore) syncVictimsLoop() {
+	defer s.wg.Done()
+
+	for {
+		for s.moveVictims() != 0 {
+			// try to update all victim watchers
+		}
+		s.mu.Lock()
+		isEmpty := len(s.victims) == 0
+		s.mu.Unlock()
+
+		var tickc <-chan time.Time
+		if !isEmpty {
+			tickc = time.After(10 * time.Millisecond)
+		}
+
+		select {
+		case <-tickc:
+		case <-s.victimc:
+		case <-s.stopc:
+			return
+		}
+	}
+}
+
+// moveVictims tries to update watches with already pending event data
+func (s *watchableStore) moveVictims() (moved int) {
+	s.mu.Lock()
+	victims := s.victims
+	s.victims = nil
+	s.mu.Unlock()
+
+	var newVictim watcherBatch
+	for _, wb := range victims {
+		// try to send responses again
+		for w, eb := range wb {
+			// watcher has observed the store up to, but not including, w.minRev
+			rev := w.minRev - 1
+			select {
+			case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}:
+				pendingEventsGauge.Add(float64(len(eb.evs)))
+			default:
+				if newVictim == nil {
+					newVictim = make(watcherBatch)
+				}
+				newVictim[w] = eb
+				continue
+			}
+			moved++
+		}
+
+		// assign completed victim watchers to unsync/sync
+		s.mu.Lock()
+		s.store.mu.Lock()
+		curRev := s.store.currentRev.main
+		for w, eb := range wb {
+			if newVictim != nil && newVictim[w] != nil {
+				// couldn't send watch response; stays victim
+				continue
+			}
+			w.victim = false
+			if eb.moreRev != 0 {
+				w.minRev = eb.moreRev
+			}
+			if w.minRev <= curRev {
+				s.unsynced.add(w)
+			} else {
+				slowWatcherGauge.Dec()
+				s.synced.add(w)
+			}
+		}
+		s.store.mu.Unlock()
+		s.mu.Unlock()
+	}
+
+	if len(newVictim) > 0 {
+		s.mu.Lock()
+		s.victims = append(s.victims, newVictim)
+		s.mu.Unlock()
+	}
+
+	return moved
+}
+
+// syncWatchers syncs unsynced watchers by:
+//	1. choose a set of watchers from the unsynced watcher group
+//	2. iterate over the set to get the minimum revision and remove compacted watchers
+//	3. use minimum revision to get all key-value pairs and send those events to watchers
+//	4. remove synced watchers in set from unsynced group and move to synced group
+func (s *watchableStore) syncWatchers() {
+	if s.unsynced.size() == 0 {
+		return
+	}
+
+	s.store.mu.Lock()
+	defer s.store.mu.Unlock()
+
+	// in order to find key-value pairs from unsynced watchers, we need to
+	// find min revision index, and these revisions can be used to
+	// query the backend store of key-value pairs
+	curRev := s.store.currentRev.main
+	compactionRev := s.store.compactMainRev
+	wg, minRev := s.unsynced.choose(maxWatchersPerSync, curRev, compactionRev)
+	minBytes, maxBytes := newRevBytes(), newRevBytes()
+	revToBytes(revision{main: minRev}, minBytes)
+	revToBytes(revision{main: curRev + 1}, maxBytes)
+
+	// UnsafeRange returns keys and values. And in boltdb, keys are revisions.
+	// values are actual key-value pairs in backend.
+	tx := s.store.b.BatchTx()
+	tx.Lock()
+	revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
+	evs := kvsToEvents(wg, revs, vs)
+	tx.Unlock()
+
+	var victims watcherBatch
+	wb := newWatcherBatch(wg, evs)
+	for w := range wg.watchers {
+		w.minRev = curRev + 1
+
+		eb, ok := wb[w]
+		if !ok {
+			// bring un-notified watcher to synced
+			s.synced.add(w)
+			s.unsynced.delete(w)
+			continue
+		}
+
+		if eb.moreRev != 0 {
+			w.minRev = eb.moreRev
+		}
+
+		select {
+		case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: curRev}:
+			pendingEventsGauge.Add(float64(len(eb.evs)))
+		default:
+			if victims == nil {
+				victims = make(watcherBatch)
+			}
+			w.victim = true
+		}
+
+		if w.victim {
+			victims[w] = eb
+		} else {
+			if eb.moreRev != 0 {
+				// stay unsynced; more to read
+				continue
+			}
+			s.synced.add(w)
+		}
+		s.unsynced.delete(w)
+	}
+	s.addVictim(victims)
+
+	vsz := 0
+	for _, v := range s.victims {
+		vsz += len(v)
+	}
+	slowWatcherGauge.Set(float64(s.unsynced.size() + vsz))
+}
+
+// kvsToEvents gets all events for the watchers from all key-value pairs
+func kvsToEvents(wg *watcherGroup, revs, vals [][]byte) (evs []mvccpb.Event) {
+	for i, v := range vals {
+		var kv mvccpb.KeyValue
+		if err := kv.Unmarshal(v); err != nil {
+			plog.Panicf("cannot unmarshal event: %v", err)
+		}
+
+		if !wg.contains(string(kv.Key)) {
+			continue
+		}
+
+		ty := mvccpb.PUT
+		if isTombstone(revs[i]) {
+			ty = mvccpb.DELETE
+			// patch in mod revision so watchers won't skip
+			kv.ModRevision = bytesToRev(revs[i]).main
+		}
+		evs = append(evs, mvccpb.Event{Kv: &kv, Type: ty})
+	}
+	return evs
+}
+
+// notify notifies the fact that given event at the given rev just happened to
+// watchers that watch on the key of the event.
+func (s *watchableStore) notify(rev int64, evs []mvccpb.Event) {
+	var victim watcherBatch
+	for w, eb := range newWatcherBatch(&s.synced, evs) {
+		if eb.revs != 1 {
+			plog.Panicf("unexpected multiple revisions in notification")
+		}
+		select {
+		case w.ch <- WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}:
+			pendingEventsGauge.Add(float64(len(eb.evs)))
+		default:
+			// move slow watcher to victims
+			w.minRev = rev + 1
+			if victim == nil {
+				victim = make(watcherBatch)
+			}
+			w.victim = true
+			victim[w] = eb
+			s.synced.delete(w)
+			slowWatcherGauge.Inc()
+		}
+	}
+	s.addVictim(victim)
+}
+
+func (s *watchableStore) addVictim(victim watcherBatch) {
+	if victim == nil {
+		return
+	}
+	s.victims = append(s.victims, victim)
+	select {
+	case s.victimc <- struct{}{}:
+	default:
+	}
+}
+
+func (s *watchableStore) rev() int64 { return s.store.Rev() }
+
+func (s *watchableStore) progress(w *watcher) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if _, ok := s.synced.watchers[w]; ok {
+		select {
+		case w.ch <- WatchResponse{WatchID: w.id, Revision: s.rev()}:
+		default:
+			// If the ch is full, this watcher is receiving events.
+			// We do not need to send progress at all.
+		}
+	}
+}
+
+type watcher struct {
+	// the watcher key
+	key []byte
+	// end indicates the end of the range to watch.
+	// If end is set, the watcher is on a range.
+	end []byte
+
+	// victim is set when ch is blocked and undergoing victim processing
+	victim bool
+
+	// compacted is set when the watcher is removed because of compaction
+	compacted bool
+
+	// minRev is the minimum revision update the watcher will accept
+	minRev int64
+	id     WatchID
+
+	// a chan to send out the watch response.
+	// The chan might be shared with other watchers.
+	ch chan<- WatchResponse
+}
--- a/vendor/github.com/coreos/etcd/mvcc/watcher.go
+++ b/vendor/github.com/coreos/etcd/mvcc/watcher.go
@@ -0,0 +1,161 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"errors"
+	"sync"
+
+	"github.com/coreos/etcd/mvcc/mvccpb"
+)
+
+var (
+	ErrWatcherNotExist = errors.New("mvcc: watcher does not exist")
+)
+
+type WatchID int64
+
+type WatchStream interface {
+	// Watch creates a watcher. The watcher watches the events happening or
+	// happened on the given key or range [key, end) from the given startRev.
+	//
+	// The whole event history can be watched unless compacted.
+	// If `startRev` <=0, watch observes events after currentRev.
+	//
+	// The returned `id` is the ID of this watcher. It appears as WatchID
+	// in events that are sent to the created watcher through stream channel.
+	//
+	Watch(key, end []byte, startRev int64) WatchID
+
+	// Chan returns a chan. All watch response will be sent to the returned chan.
+	Chan() <-chan WatchResponse
+
+	// RequestProgress requests the progress of the watcher with given ID. The response
+	// will only be sent if the watcher is currently synced.
+	// The responses will be sent through the WatchRespone Chan attached
+	// with this stream to ensure correct ordering.
+	// The responses contains no events. The revision in the response is the progress
+	// of the watchers since the watcher is currently synced.
+	RequestProgress(id WatchID)
+
+	// Cancel cancels a watcher by giving its ID. If watcher does not exist, an error will be
+	// returned.
+	Cancel(id WatchID) error
+
+	// Close closes Chan and release all related resources.
+	Close()
+
+	// Rev returns the current revision of the KV the stream watches on.
+	Rev() int64
+}
+
+type WatchResponse struct {
+	// WatchID is the WatchID of the watcher this response sent to.
+	WatchID WatchID
+
+	// Events contains all the events that needs to send.
+	Events []mvccpb.Event
+
+	// Revision is the revision of the KV when the watchResponse is created.
+	// For a normal response, the revision should be the same as the last
+	// modified revision inside Events. For a delayed response to a unsynced
+	// watcher, the revision is greater than the last modified revision
+	// inside Events.
+	Revision int64
+
+	// CompactRevision is set when the watcher is cancelled due to compaction.
+	CompactRevision int64
+}
+
+// watchStream contains a collection of watchers that share
+// one streaming chan to send out watched events and other control events.
+type watchStream struct {
+	watchable watchable
+	ch        chan WatchResponse
+
+	mu sync.Mutex // guards fields below it
+	// nextID is the ID pre-allocated for next new watcher in this stream
+	nextID   WatchID
+	closed   bool
+	cancels  map[WatchID]cancelFunc
+	watchers map[WatchID]*watcher
+}
+
+// Watch creates a new watcher in the stream and returns its WatchID.
+// TODO: return error if ws is closed?
+func (ws *watchStream) Watch(key, end []byte, startRev int64) WatchID {
+	ws.mu.Lock()
+	defer ws.mu.Unlock()
+	if ws.closed {
+		return -1
+	}
+
+	id := ws.nextID
+	ws.nextID++
+
+	w, c := ws.watchable.watch(key, end, startRev, id, ws.ch)
+
+	ws.cancels[id] = c
+	ws.watchers[id] = w
+	return id
+}
+
+func (ws *watchStream) Chan() <-chan WatchResponse {
+	return ws.ch
+}
+
+func (ws *watchStream) Cancel(id WatchID) error {
+	ws.mu.Lock()
+	cancel, ok := ws.cancels[id]
+	ok = ok && !ws.closed
+	if ok {
+		delete(ws.cancels, id)
+		delete(ws.watchers, id)
+	}
+	ws.mu.Unlock()
+	if !ok {
+		return ErrWatcherNotExist
+	}
+	cancel()
+	return nil
+}
+
+func (ws *watchStream) Close() {
+	ws.mu.Lock()
+	defer ws.mu.Unlock()
+
+	for _, cancel := range ws.cancels {
+		cancel()
+	}
+	ws.closed = true
+	close(ws.ch)
+	watchStreamGauge.Dec()
+}
+
+func (ws *watchStream) Rev() int64 {
+	ws.mu.Lock()
+	defer ws.mu.Unlock()
+	return ws.watchable.rev()
+}
+
+func (ws *watchStream) RequestProgress(id WatchID) {
+	ws.mu.Lock()
+	w, ok := ws.watchers[id]
+	ws.mu.Unlock()
+	if !ok {
+		return
+	}
+	ws.watchable.progress(w)
+}
--- a/vendor/github.com/coreos/etcd/mvcc/watcher_group.go
+++ b/vendor/github.com/coreos/etcd/mvcc/watcher_group.go
@@ -0,0 +1,279 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mvcc
+
+import (
+	"math"
+
+	"github.com/coreos/etcd/mvcc/mvccpb"
+	"github.com/coreos/etcd/pkg/adt"
+)
+
+var (
+	// watchBatchMaxRevs is the maximum distinct revisions that
+	// may be sent to an unsynced watcher at a time. Declared as
+	// var instead of const for testing purposes.
+	watchBatchMaxRevs = 1000
+)
+
+type eventBatch struct {
+	// evs is a batch of revision-ordered events
+	evs []mvccpb.Event
+	// revs is the minimum unique revisions observed for this batch
+	revs int
+	// moreRev is first revision with more events following this batch
+	moreRev int64
+}
+
+func (eb *eventBatch) add(ev mvccpb.Event) {
+	if eb.revs > watchBatchMaxRevs {
+		// maxed out batch size
+		return
+	}
+
+	if len(eb.evs) == 0 {
+		// base case
+		eb.revs = 1
+		eb.evs = append(eb.evs, ev)
+		return
+	}
+
+	// revision accounting
+	ebRev := eb.evs[len(eb.evs)-1].Kv.ModRevision
+	evRev := ev.Kv.ModRevision
+	if evRev > ebRev {
+		eb.revs++
+		if eb.revs > watchBatchMaxRevs {
+			eb.moreRev = evRev
+			return
+		}
+	}
+
+	eb.evs = append(eb.evs, ev)
+}
+
+type watcherBatch map[*watcher]*eventBatch
+
+func (wb watcherBatch) add(w *watcher, ev mvccpb.Event) {
+	eb := wb[w]
+	if eb == nil {
+		eb = &eventBatch{}
+		wb[w] = eb
+	}
+	eb.add(ev)
+}
+
+// newWatcherBatch maps watchers to their matched events. It enables quick
+// events look up by watcher.
+func newWatcherBatch(wg *watcherGroup, evs []mvccpb.Event) watcherBatch {
+	wb := make(watcherBatch)
+	for _, ev := range evs {
+		for w := range wg.watcherSetByKey(string(ev.Kv.Key)) {
+			if ev.Kv.ModRevision >= w.minRev {
+				// don't double notify
+				wb.add(w, ev)
+			}
+		}
+	}
+	return wb
+}
+
+type watcherSet map[*watcher]struct{}
+
+func (w watcherSet) add(wa *watcher) {
+	if _, ok := w[wa]; ok {
+		panic("add watcher twice!")
+	}
+	w[wa] = struct{}{}
+}
+
+func (w watcherSet) union(ws watcherSet) {
+	for wa := range ws {
+		w.add(wa)
+	}
+}
+
+func (w watcherSet) delete(wa *watcher) {
+	if _, ok := w[wa]; !ok {
+		panic("removing missing watcher!")
+	}
+	delete(w, wa)
+}
+
+type watcherSetByKey map[string]watcherSet
+
+func (w watcherSetByKey) add(wa *watcher) {
+	set := w[string(wa.key)]
+	if set == nil {
+		set = make(watcherSet)
+		w[string(wa.key)] = set
+	}
+	set.add(wa)
+}
+
+func (w watcherSetByKey) delete(wa *watcher) bool {
+	k := string(wa.key)
+	if v, ok := w[k]; ok {
+		if _, ok := v[wa]; ok {
+			delete(v, wa)
+			if len(v) == 0 {
+				// remove the set; nothing left
+				delete(w, k)
+			}
+			return true
+		}
+	}
+	return false
+}
+
+// watcherGroup is a collection of watchers organized by their ranges
+type watcherGroup struct {
+	// keyWatchers has the watchers that watch on a single key
+	keyWatchers watcherSetByKey
+	// ranges has the watchers that watch a range; it is sorted by interval
+	ranges adt.IntervalTree
+	// watchers is the set of all watchers
+	watchers watcherSet
+}
+
+func newWatcherGroup() watcherGroup {
+	return watcherGroup{
+		keyWatchers: make(watcherSetByKey),
+		watchers:    make(watcherSet),
+	}
+}
+
+// add puts a watcher in the group.
+func (wg *watcherGroup) add(wa *watcher) {
+	wg.watchers.add(wa)
+	if wa.end == nil {
+		wg.keyWatchers.add(wa)
+		return
+	}
+
+	// interval already registered?
+	ivl := adt.NewStringAffineInterval(string(wa.key), string(wa.end))
+	if iv := wg.ranges.Find(ivl); iv != nil {
+		iv.Val.(watcherSet).add(wa)
+		return
+	}
+
+	// not registered, put in interval tree
+	ws := make(watcherSet)
+	ws.add(wa)
+	wg.ranges.Insert(ivl, ws)
+}
+
+// contains is whether the given key has a watcher in the group.
+func (wg *watcherGroup) contains(key string) bool {
+	_, ok := wg.keyWatchers[key]
+	return ok || wg.ranges.Contains(adt.NewStringAffinePoint(key))
+}
+
+// size gives the number of unique watchers in the group.
+func (wg *watcherGroup) size() int { return len(wg.watchers) }
+
+// delete removes a watcher from the group.
+func (wg *watcherGroup) delete(wa *watcher) bool {
+	if _, ok := wg.watchers[wa]; !ok {
+		return false
+	}
+	wg.watchers.delete(wa)
+	if wa.end == nil {
+		wg.keyWatchers.delete(wa)
+		return true
+	}
+
+	ivl := adt.NewStringAffineInterval(string(wa.key), string(wa.end))
+	iv := wg.ranges.Find(ivl)
+	if iv == nil {
+		return false
+	}
+
+	ws := iv.Val.(watcherSet)
+	delete(ws, wa)
+	if len(ws) == 0 {
+		// remove interval missing watchers
+		if ok := wg.ranges.Delete(ivl); !ok {
+			panic("could not remove watcher from interval tree")
+		}
+	}
+
+	return true
+}
+
+// choose selects watchers from the watcher group to update
+func (wg *watcherGroup) choose(maxWatchers int, curRev, compactRev int64) (*watcherGroup, int64) {
+	if len(wg.watchers) < maxWatchers {
+		return wg, wg.chooseAll(curRev, compactRev)
+	}
+	ret := newWatcherGroup()
+	for w := range wg.watchers {
+		if maxWatchers <= 0 {
+			break
+		}
+		maxWatchers--
+		ret.add(w)
+	}
+	return &ret, ret.chooseAll(curRev, compactRev)
+}
+
+func (wg *watcherGroup) chooseAll(curRev, compactRev int64) int64 {
+	minRev := int64(math.MaxInt64)
+	for w := range wg.watchers {
+		if w.minRev > curRev {
+			panic("watcher current revision should not exceed current revision")
+		}
+		if w.minRev < compactRev {
+			select {
+			case w.ch <- WatchResponse{WatchID: w.id, CompactRevision: compactRev}:
+				w.compacted = true
+				wg.delete(w)
+			default:
+				// retry next time
+			}
+			continue
+		}
+		if minRev > w.minRev {
+			minRev = w.minRev
+		}
+	}
+	return minRev
+}
+
+// watcherSetByKey gets the set of watchers that receive events on the given key.
+func (wg *watcherGroup) watcherSetByKey(key string) watcherSet {
+	wkeys := wg.keyWatchers[key]
+	wranges := wg.ranges.Stab(adt.NewStringAffinePoint(key))
+
+	// zero-copy cases
+	switch {
+	case len(wranges) == 0:
+		// no need to merge ranges or copy; reuse single-key set
+		return wkeys
+	case len(wranges) == 0 && len(wkeys) == 0:
+		return nil
+	case len(wranges) == 1 && len(wkeys) == 0:
+		return wranges[0].Val.(watcherSet)
+	}
+
+	// copy case
+	ret := make(watcherSet)
+	ret.union(wg.keyWatchers[key])
+	for _, item := range wranges {
+		ret.union(item.Val.(watcherSet))
+	}
+	return ret
+}