From a14f791d8c4ec432f93f58eac9ce647226ed483e Mon Sep 17 00:00:00 2001 From: Marcin Wielgus Date: Mon, 1 Feb 2016 12:12:37 +0100 Subject: [PATCH] Revert "Merge pull request #20329 from kubernetes/revert-20323-bump-influxdb" This reverts commit b4188ec459cd89b8cf2fe03daa6888ef9421b6c1, reversing changes made to 28951bd66ad18098befc426bf0baef7019083af0. --- Godeps/Godeps.json | 45 +- Godeps/LICENSES.md | 4 + .../github.com/armon/go-metrics/.gitignore | 22 + .../src/github.com/armon/go-metrics/LICENSE | 20 + .../src/github.com/armon/go-metrics/README.md | 71 + .../github.com/armon/go-metrics/const_unix.go | 12 + .../armon/go-metrics/const_windows.go | 13 + .../armon/go-metrics/datadog/dogstatsd.go | 125 + .../src/github.com/armon/go-metrics/inmem.go | 241 ++ .../armon/go-metrics/inmem_signal.go | 100 + .../github.com/armon/go-metrics/metrics.go | 115 + .../armon/go-metrics/prometheus/prometheus.go | 88 + .../src/github.com/armon/go-metrics/sink.go | 52 + .../src/github.com/armon/go-metrics/start.go | 95 + .../src/github.com/armon/go-metrics/statsd.go | 154 + .../github.com/armon/go-metrics/statsite.go | 142 + .../github.com/hashicorp/go-msgpack/LICENSE | 25 + .../hashicorp/go-msgpack/codec/0doc.go | 143 + .../hashicorp/go-msgpack/codec/README.md | 174 + .../hashicorp/go-msgpack/codec/binc.go | 786 +++++ .../hashicorp/go-msgpack/codec/decode.go | 1048 ++++++ .../hashicorp/go-msgpack/codec/encode.go | 1001 ++++++ .../hashicorp/go-msgpack/codec/helper.go | 589 ++++ .../go-msgpack/codec/helper_internal.go | 127 + .../hashicorp/go-msgpack/codec/msgpack.go | 816 +++++ .../go-msgpack/codec/msgpack_test.py | 110 + .../hashicorp/go-msgpack/codec/rpc.go | 152 + .../hashicorp/go-msgpack/codec/simple.go | 461 +++ .../hashicorp/go-msgpack/codec/time.go | 193 ++ .../github.com/hashicorp/raft-boltdb/LICENSE | 362 ++ .../hashicorp/raft-boltdb/README.md | 11 + .../hashicorp/raft-boltdb/bolt_store.go | 231 ++ .../github.com/hashicorp/raft-boltdb/util.go | 37 + .../src/github.com/hashicorp/raft/.gitignore | 23 + .../src/github.com/hashicorp/raft/.travis.yml | 14 + .../src/github.com/hashicorp/raft/LICENSE | 354 ++ .../src/github.com/hashicorp/raft/Makefile | 17 + .../src/github.com/hashicorp/raft/README.md | 89 + .../github.com/hashicorp/raft/bench/bench.go | 171 + .../src/github.com/hashicorp/raft/commands.go | 84 + .../src/github.com/hashicorp/raft/config.go | 134 + .../hashicorp/raft/discard_snapshot.go | 48 + .../hashicorp/raft/file_snapshot.go | 470 +++ .../src/github.com/hashicorp/raft/fsm.go | 37 + .../src/github.com/hashicorp/raft/future.go | 182 + .../src/github.com/hashicorp/raft/inflight.go | 213 ++ .../github.com/hashicorp/raft/inmem_store.go | 116 + .../hashicorp/raft/inmem_transport.go | 315 ++ .../src/github.com/hashicorp/raft/log.go | 60 + .../github.com/hashicorp/raft/log_cache.go | 79 + .../hashicorp/raft/net_transport.go | 622 ++++ .../src/github.com/hashicorp/raft/peer.go | 122 + .../src/github.com/hashicorp/raft/raft.go | 1887 +++++++++++ .../github.com/hashicorp/raft/replication.go | 517 +++ .../src/github.com/hashicorp/raft/snapshot.go | 40 + .../src/github.com/hashicorp/raft/stable.go | 15 + .../src/github.com/hashicorp/raft/state.go | 169 + .../hashicorp/raft/tcp_transport.go | 105 + .../github.com/hashicorp/raft/transport.go | 85 + .../src/github.com/hashicorp/raft/util.go | 200 ++ .../src/github.com/influxdb/influxdb/LICENSE | 2 +- .../influxdb/influxdb/_vendor/raft/LICENSE | 20 - .../influxdb/influxdb/client/README.md | 208 +- .../influxdb/client/examples/example.go | 200 -- .../influxdb/influxdb/client/influxdb.go | 1073 +++--- .../influxdb/influxdb/client/series.go | 19 - .../influxdb/influxdb/client/shard_space.go | 15 - .../influxdb/influxdb/influxql/INFLUXQL.md | 650 ++++ .../influxdb/influxdb/influxql/NOTES | 682 ++++ .../influxdb/influxdb/influxql/ast.go | 3016 +++++++++++++++++ .../influxdb/influxdb/influxql/doc.go | 64 + .../influxdb/influxdb/influxql/functions.go | 1115 ++++++ .../influxdb/influxdb/influxql/parser.go | 2238 ++++++++++++ .../influxdb/influxdb/influxql/result.go | 223 ++ .../influxdb/influxdb/influxql/scanner.go | 561 +++ .../influxdb/influxdb/influxql/token.go | 296 ++ .../influxdb/influxdb/meta/config.go | 52 + .../github.com/influxdb/influxdb/meta/data.go | 1055 ++++++ .../influxdb/influxdb/meta/errors.go | 116 + .../influxdb/meta/internal/meta.pb.go | 1167 +++++++ .../influxdb/meta/internal/meta.proto | 257 ++ .../influxdb/meta/statement_executor.go | 280 ++ .../influxdb/influxdb/meta/store.go | 1876 ++++++++++ .../influxdb/influxdb/snapshot/snapshot.go | 529 +++ .../github.com/influxdb/influxdb/toml/toml.go | 72 + .../influxdb/influxdb/tsdb/README.md | 85 + .../influxdb/influxdb/tsdb/batcher.go | 142 + .../influxdb/influxdb/tsdb/config.go | 34 + .../github.com/influxdb/influxdb/tsdb/doc.go | 5 + .../influxdb/influxdb/tsdb/engine.go | 961 ++++++ .../influxdb/tsdb/internal/meta.pb.go | 123 + .../influxdb/tsdb/internal/meta.proto | 27 + .../influxdb/influxdb/tsdb/mapper.go | 751 ++++ .../github.com/influxdb/influxdb/tsdb/meta.go | 1279 +++++++ .../influxdb/influxdb/tsdb/monitor.go | 83 + .../influxdb/influxdb/tsdb/points.go | 1135 +++++++ .../influxdb/influxdb/tsdb/query_executor.go | 1031 ++++++ .../influxdb/influxdb/tsdb/shard.go | 1218 +++++++ .../influxdb/influxdb/tsdb/snapshot_writer.go | 124 + .../influxdb/influxdb/tsdb/store.go | 343 ++ .../google/heapster-controller.yaml | 30 +- .../heapster-controller-combined.yaml | 32 +- .../influxdb/heapster-controller.yaml | 26 +- .../influxdb/influxdb-grafana-controller.yaml | 11 +- .../standalone/heapster-controller.yaml | 10 +- hack/jenkins/e2e.sh | 2 - .../admission/initialresources/influxdb.go | 99 +- test/e2e/initial_resources.go | 10 +- test/e2e/monitoring.go | 89 +- 109 files changed, 36182 insertions(+), 987 deletions(-) create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/.gitignore create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/LICENSE create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/README.md create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/const_unix.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/const_windows.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/datadog/dogstatsd.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/inmem.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/inmem_signal.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/metrics.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/prometheus/prometheus.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/sink.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/start.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/statsd.go create mode 100644 Godeps/_workspace/src/github.com/armon/go-metrics/statsite.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/LICENSE create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/0doc.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/README.md create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/binc.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/decode.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/encode.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/helper.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/helper_internal.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/msgpack.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/msgpack_test.py create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/rpc.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/simple.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/time.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/LICENSE create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/README.md create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/bolt_store.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/util.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/.gitignore create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/.travis.yml create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/LICENSE create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/Makefile create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/README.md create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/bench/bench.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/commands.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/config.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/discard_snapshot.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/file_snapshot.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/fsm.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/future.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/inflight.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/inmem_store.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/inmem_transport.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/log.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/log_cache.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/net_transport.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/peer.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/raft.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/replication.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/snapshot.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/stable.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/state.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/tcp_transport.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/transport.go create mode 100644 Godeps/_workspace/src/github.com/hashicorp/raft/util.go delete mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/_vendor/raft/LICENSE delete mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/client/examples/example.go delete mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/client/series.go delete mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/client/shard_space.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/INFLUXQL.md create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/NOTES create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/ast.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/doc.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/functions.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/parser.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/result.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/scanner.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/token.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/meta/config.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/meta/data.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/meta/errors.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/meta/internal/meta.pb.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/meta/internal/meta.proto create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/meta/statement_executor.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/meta/store.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/snapshot/snapshot.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/toml/toml.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/README.md create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/batcher.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/config.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/doc.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/internal/meta.pb.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/internal/meta.proto create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/mapper.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/meta.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/monitor.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/points.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/query_executor.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/shard.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/snapshot_writer.go create mode 100644 Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/store.go diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 987910fe027..5847fcde6da 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -47,6 +47,10 @@ "Comment": "v0.7.4-6-g5d54e27", "Rev": "5d54e27f1764a0309eafe12c9df7bac03f241646" }, + { + "ImportPath": "github.com/armon/go-metrics", + "Rev": "345426c77237ece5dab0e1605c3e4b35c3f54757" + }, { "ImportPath": "github.com/aws/aws-sdk-go/aws", "Comment": "v1.0.7", @@ -679,6 +683,18 @@ "ImportPath": "github.com/gorilla/mux", "Rev": "8096f47503459bcc74d1f4c487b7e6e42e5746b5" }, + { + "ImportPath": "github.com/hashicorp/go-msgpack/codec", + "Rev": "fa3f63826f7c23912c15263591e65d54d080b458" + }, + { + "ImportPath": "github.com/hashicorp/raft", + "Rev": "057b893fd996696719e98b6c44649ea14968c811" + }, + { + "ImportPath": "github.com/hashicorp/raft-boltdb", + "Rev": "d1e82c1ec3f15ee991f7cc7ffd5b67ff6f5bbaee" + }, { "ImportPath": "github.com/imdario/mergo", "Comment": "0.1.3-8-g6633656", @@ -690,8 +706,33 @@ }, { "ImportPath": "github.com/influxdb/influxdb/client", - "Comment": "v0.8.8", - "Rev": "afde71eb1740fd763ab9450e1f700ba0e53c36d0" + "Comment": "v0.9.2.1", + "Rev": "b237c68bab4756507baf6840023be103853e77db" + }, + { + "ImportPath": "github.com/influxdb/influxdb/influxql", + "Comment": "v0.9.2.1", + "Rev": "b237c68bab4756507baf6840023be103853e77db" + }, + { + "ImportPath": "github.com/influxdb/influxdb/meta", + "Comment": "v0.9.2.1", + "Rev": "b237c68bab4756507baf6840023be103853e77db" + }, + { + "ImportPath": "github.com/influxdb/influxdb/snapshot", + "Comment": "v0.9.2.1", + "Rev": "b237c68bab4756507baf6840023be103853e77db" + }, + { + "ImportPath": "github.com/influxdb/influxdb/toml", + "Comment": "v0.9.2.1", + "Rev": "b237c68bab4756507baf6840023be103853e77db" + }, + { + "ImportPath": "github.com/influxdb/influxdb/tsdb", + "Comment": "v0.9.2.1", + "Rev": "b237c68bab4756507baf6840023be103853e77db" }, { "ImportPath": "github.com/jmespath/go-jmespath", diff --git a/Godeps/LICENSES.md b/Godeps/LICENSES.md index 652d803eae6..cf5170dfe9c 100644 --- a/Godeps/LICENSES.md +++ b/Godeps/LICENSES.md @@ -8,6 +8,7 @@ bitbucket.org/ww/goautoneg | spdxBSD3 github.com/abbot/go-http-auth | Apache-2 github.com/appc/cni | Apache-2 github.com/appc/spec | Apache-2 +github.com/armon/go-metrics | MITname github.com/aws/aws-sdk-go | Apache-2 github.com/beorn7/perks/quantile | MIT? github.com/blang/semver | MITname @@ -50,6 +51,9 @@ github.com/google/cadvisor | Apache-2 github.com/google/gofuzz | Apache-2 github.com/gorilla/context | spdxBSD3 github.com/gorilla/mux | spdxBSD3 +github.com/hashicorp/go-msgpack | spdxBSD3 +github.com/hashicorp/raft | IntelPart08 +github.com/hashicorp/raft-boltdb | IntelPart08 github.com/imdario/mergo | spdxBSD3 github.com/inconshreveable/mousetrap | Apache-2 github.com/influxdb/influxdb | MITname diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/.gitignore b/Godeps/_workspace/src/github.com/armon/go-metrics/.gitignore new file mode 100644 index 00000000000..00268614f04 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/.gitignore @@ -0,0 +1,22 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/LICENSE b/Godeps/_workspace/src/github.com/armon/go-metrics/LICENSE new file mode 100644 index 00000000000..106569e542b --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2013 Armon Dadgar + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/README.md b/Godeps/_workspace/src/github.com/armon/go-metrics/README.md new file mode 100644 index 00000000000..7b6f23e29f8 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/README.md @@ -0,0 +1,71 @@ +go-metrics +========== + +This library provides a `metrics` package which can be used to instrument code, +expose application metrics, and profile runtime performance in a flexible manner. + +Current API: [![GoDoc](https://godoc.org/github.com/armon/go-metrics?status.svg)](https://godoc.org/github.com/armon/go-metrics) + +Sinks +===== + +The `metrics` package makes use of a `MetricSink` interface to support delivery +to any type of backend. Currently the following sinks are provided: + +* StatsiteSink : Sinks to a [statsite](https://github.com/armon/statsite/) instance (TCP) +* StatsdSink: Sinks to a [StatsD](https://github.com/etsy/statsd/) / statsite instance (UDP) +* PrometheusSink: Sinks to a [Prometheus](http://prometheus.io/) metrics endpoint (exposed via HTTP for scrapes) +* InmemSink : Provides in-memory aggregation, can be used to export stats +* FanoutSink : Sinks to multiple sinks. Enables writing to multiple statsite instances for example. +* BlackholeSink : Sinks to nowhere + +In addition to the sinks, the `InmemSignal` can be used to catch a signal, +and dump a formatted output of recent metrics. For example, when a process gets +a SIGUSR1, it can dump to stderr recent performance metrics for debugging. + +Examples +======== + +Here is an example of using the package: + + func SlowMethod() { + // Profiling the runtime of a method + defer metrics.MeasureSince([]string{"SlowMethod"}, time.Now()) + } + + // Configure a statsite sink as the global metrics sink + sink, _ := metrics.NewStatsiteSink("statsite:8125") + metrics.NewGlobal(metrics.DefaultConfig("service-name"), sink) + + // Emit a Key/Value pair + metrics.EmitKey([]string{"questions", "meaning of life"}, 42) + + +Here is an example of setting up an signal handler: + + // Setup the inmem sink and signal handler + inm := metrics.NewInmemSink(10*time.Second, time.Minute) + sig := metrics.DefaultInmemSignal(inm) + metrics.NewGlobal(metrics.DefaultConfig("service-name"), inm) + + // Run some code + inm.SetGauge([]string{"foo"}, 42) + inm.EmitKey([]string{"bar"}, 30) + + inm.IncrCounter([]string{"baz"}, 42) + inm.IncrCounter([]string{"baz"}, 1) + inm.IncrCounter([]string{"baz"}, 80) + + inm.AddSample([]string{"method", "wow"}, 42) + inm.AddSample([]string{"method", "wow"}, 100) + inm.AddSample([]string{"method", "wow"}, 22) + + .... + +When a signal comes in, output like the following will be dumped to stderr: + + [2014-01-28 14:57:33.04 -0800 PST][G] 'foo': 42.000 + [2014-01-28 14:57:33.04 -0800 PST][P] 'bar': 30.000 + [2014-01-28 14:57:33.04 -0800 PST][C] 'baz': Count: 3 Min: 1.000 Mean: 41.000 Max: 80.000 Stddev: 39.509 + [2014-01-28 14:57:33.04 -0800 PST][S] 'method.wow': Count: 3 Min: 22.000 Mean: 54.667 Max: 100.000 Stddev: 40.513 + diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/const_unix.go b/Godeps/_workspace/src/github.com/armon/go-metrics/const_unix.go new file mode 100644 index 00000000000..31098dd57e5 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/const_unix.go @@ -0,0 +1,12 @@ +// +build !windows + +package metrics + +import ( + "syscall" +) + +const ( + // DefaultSignal is used with DefaultInmemSignal + DefaultSignal = syscall.SIGUSR1 +) diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/const_windows.go b/Godeps/_workspace/src/github.com/armon/go-metrics/const_windows.go new file mode 100644 index 00000000000..38136af3e42 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/const_windows.go @@ -0,0 +1,13 @@ +// +build windows + +package metrics + +import ( + "syscall" +) + +const ( + // DefaultSignal is used with DefaultInmemSignal + // Windows has no SIGUSR1, use SIGBREAK + DefaultSignal = syscall.Signal(21) +) diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/datadog/dogstatsd.go b/Godeps/_workspace/src/github.com/armon/go-metrics/datadog/dogstatsd.go new file mode 100644 index 00000000000..aaba9fe0e22 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/datadog/dogstatsd.go @@ -0,0 +1,125 @@ +package datadog + +import ( + "fmt" + "strings" + + "github.com/DataDog/datadog-go/statsd" +) + +// DogStatsdSink provides a MetricSink that can be used +// with a dogstatsd server. It utilizes the Dogstatsd client at github.com/DataDog/datadog-go/statsd +type DogStatsdSink struct { + client *statsd.Client + hostName string + propagateHostname bool +} + +// NewDogStatsdSink is used to create a new DogStatsdSink with sane defaults +func NewDogStatsdSink(addr string, hostName string) (*DogStatsdSink, error) { + client, err := statsd.New(addr) + if err != nil { + return nil, err + } + sink := &DogStatsdSink{ + client: client, + hostName: hostName, + propagateHostname: false, + } + return sink, nil +} + +// SetTags sets common tags on the Dogstatsd Client that will be sent +// along with all dogstatsd packets. +// Ref: http://docs.datadoghq.com/guides/dogstatsd/#tags +func (s *DogStatsdSink) SetTags(tags []string) { + s.client.Tags = tags +} + +// EnableHostnamePropagation forces a Dogstatsd `host` tag with the value specified by `s.HostName` +// Since the go-metrics package has its own mechanism for attaching a hostname to metrics, +// setting the `propagateHostname` flag ensures that `s.HostName` overrides the host tag naively set by the DogStatsd server +func (s *DogStatsdSink) EnableHostNamePropagation() { + s.propagateHostname = true +} + +func (s *DogStatsdSink) flattenKey(parts []string) string { + joined := strings.Join(parts, ".") + return strings.Map(func(r rune) rune { + switch r { + case ':': + fallthrough + case ' ': + return '_' + default: + return r + } + }, joined) +} + +func (s *DogStatsdSink) parseKey(key []string) ([]string, []string) { + // Since DogStatsd supports dimensionality via tags on metric keys, this sink's approach is to splice the hostname out of the key in favor of a `host` tag + // The `host` tag is either forced here, or set downstream by the DogStatsd server + + var tags []string + hostName := s.hostName + + //Splice the hostname out of the key + for i, el := range key { + if el == hostName { + key = append(key[:i], key[i+1:]...) + } + } + + if s.propagateHostname { + tags = append(tags, fmt.Sprintf("host:%s", hostName)) + } + return key, tags +} + +// Implementation of methods in the MetricSink interface + +func (s *DogStatsdSink) SetGauge(key []string, val float32) { + s.SetGaugeWithTags(key, val, []string{}) +} + +func (s *DogStatsdSink) IncrCounter(key []string, val float32) { + s.IncrCounterWithTags(key, val, []string{}) +} + +// EmitKey is not implemented since DogStatsd does not provide a metric type that holds an +// arbitrary number of values +func (s *DogStatsdSink) EmitKey(key []string, val float32) { +} + +func (s *DogStatsdSink) AddSample(key []string, val float32) { + s.AddSampleWithTags(key, val, []string{}) +} + +// The following ...WithTags methods correspond to Datadog's Tag extension to Statsd. +// http://docs.datadoghq.com/guides/dogstatsd/#tags + +func (s *DogStatsdSink) SetGaugeWithTags(key []string, val float32, tags []string) { + flatKey, tags := s.getFlatkeyAndCombinedTags(key, tags) + rate := 1.0 + s.client.Gauge(flatKey, float64(val), tags, rate) +} + +func (s *DogStatsdSink) IncrCounterWithTags(key []string, val float32, tags []string) { + flatKey, tags := s.getFlatkeyAndCombinedTags(key, tags) + rate := 1.0 + s.client.Count(flatKey, int64(val), tags, rate) +} + +func (s *DogStatsdSink) AddSampleWithTags(key []string, val float32, tags []string) { + flatKey, tags := s.getFlatkeyAndCombinedTags(key, tags) + rate := 1.0 + s.client.TimeInMilliseconds(flatKey, float64(val), tags, rate) +} + +func (s *DogStatsdSink) getFlatkeyAndCombinedTags(key []string, tags []string) (flattenedKey string, combinedTags []string) { + key, hostTags := s.parseKey(key) + flatKey := s.flattenKey(key) + tags = append(tags, hostTags...) + return flatKey, tags +} diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/inmem.go b/Godeps/_workspace/src/github.com/armon/go-metrics/inmem.go new file mode 100644 index 00000000000..da503296060 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/inmem.go @@ -0,0 +1,241 @@ +package metrics + +import ( + "fmt" + "math" + "strings" + "sync" + "time" +) + +// InmemSink provides a MetricSink that does in-memory aggregation +// without sending metrics over a network. It can be embedded within +// an application to provide profiling information. +type InmemSink struct { + // How long is each aggregation interval + interval time.Duration + + // Retain controls how many metrics interval we keep + retain time.Duration + + // maxIntervals is the maximum length of intervals. + // It is retain / interval. + maxIntervals int + + // intervals is a slice of the retained intervals + intervals []*IntervalMetrics + intervalLock sync.RWMutex +} + +// IntervalMetrics stores the aggregated metrics +// for a specific interval +type IntervalMetrics struct { + sync.RWMutex + + // The start time of the interval + Interval time.Time + + // Gauges maps the key to the last set value + Gauges map[string]float32 + + // Points maps the string to the list of emitted values + // from EmitKey + Points map[string][]float32 + + // Counters maps the string key to a sum of the counter + // values + Counters map[string]*AggregateSample + + // Samples maps the key to an AggregateSample, + // which has the rolled up view of a sample + Samples map[string]*AggregateSample +} + +// NewIntervalMetrics creates a new IntervalMetrics for a given interval +func NewIntervalMetrics(intv time.Time) *IntervalMetrics { + return &IntervalMetrics{ + Interval: intv, + Gauges: make(map[string]float32), + Points: make(map[string][]float32), + Counters: make(map[string]*AggregateSample), + Samples: make(map[string]*AggregateSample), + } +} + +// AggregateSample is used to hold aggregate metrics +// about a sample +type AggregateSample struct { + Count int // The count of emitted pairs + Sum float64 // The sum of values + SumSq float64 // The sum of squared values + Min float64 // Minimum value + Max float64 // Maximum value + LastUpdated time.Time // When value was last updated +} + +// Computes a Stddev of the values +func (a *AggregateSample) Stddev() float64 { + num := (float64(a.Count) * a.SumSq) - math.Pow(a.Sum, 2) + div := float64(a.Count * (a.Count - 1)) + if div == 0 { + return 0 + } + return math.Sqrt(num / div) +} + +// Computes a mean of the values +func (a *AggregateSample) Mean() float64 { + if a.Count == 0 { + return 0 + } + return a.Sum / float64(a.Count) +} + +// Ingest is used to update a sample +func (a *AggregateSample) Ingest(v float64) { + a.Count++ + a.Sum += v + a.SumSq += (v * v) + if v < a.Min || a.Count == 1 { + a.Min = v + } + if v > a.Max || a.Count == 1 { + a.Max = v + } + a.LastUpdated = time.Now() +} + +func (a *AggregateSample) String() string { + if a.Count == 0 { + return "Count: 0" + } else if a.Stddev() == 0 { + return fmt.Sprintf("Count: %d Sum: %0.3f LastUpdated: %s", a.Count, a.Sum, a.LastUpdated) + } else { + return fmt.Sprintf("Count: %d Min: %0.3f Mean: %0.3f Max: %0.3f Stddev: %0.3f Sum: %0.3f LastUpdated: %s", + a.Count, a.Min, a.Mean(), a.Max, a.Stddev(), a.Sum, a.LastUpdated) + } +} + +// NewInmemSink is used to construct a new in-memory sink. +// Uses an aggregation interval and maximum retention period. +func NewInmemSink(interval, retain time.Duration) *InmemSink { + i := &InmemSink{ + interval: interval, + retain: retain, + maxIntervals: int(retain / interval), + } + i.intervals = make([]*IntervalMetrics, 0, i.maxIntervals) + return i +} + +func (i *InmemSink) SetGauge(key []string, val float32) { + k := i.flattenKey(key) + intv := i.getInterval() + + intv.Lock() + defer intv.Unlock() + intv.Gauges[k] = val +} + +func (i *InmemSink) EmitKey(key []string, val float32) { + k := i.flattenKey(key) + intv := i.getInterval() + + intv.Lock() + defer intv.Unlock() + vals := intv.Points[k] + intv.Points[k] = append(vals, val) +} + +func (i *InmemSink) IncrCounter(key []string, val float32) { + k := i.flattenKey(key) + intv := i.getInterval() + + intv.Lock() + defer intv.Unlock() + + agg := intv.Counters[k] + if agg == nil { + agg = &AggregateSample{} + intv.Counters[k] = agg + } + agg.Ingest(float64(val)) +} + +func (i *InmemSink) AddSample(key []string, val float32) { + k := i.flattenKey(key) + intv := i.getInterval() + + intv.Lock() + defer intv.Unlock() + + agg := intv.Samples[k] + if agg == nil { + agg = &AggregateSample{} + intv.Samples[k] = agg + } + agg.Ingest(float64(val)) +} + +// Data is used to retrieve all the aggregated metrics +// Intervals may be in use, and a read lock should be acquired +func (i *InmemSink) Data() []*IntervalMetrics { + // Get the current interval, forces creation + i.getInterval() + + i.intervalLock.RLock() + defer i.intervalLock.RUnlock() + + intervals := make([]*IntervalMetrics, len(i.intervals)) + copy(intervals, i.intervals) + return intervals +} + +func (i *InmemSink) getExistingInterval(intv time.Time) *IntervalMetrics { + i.intervalLock.RLock() + defer i.intervalLock.RUnlock() + + n := len(i.intervals) + if n > 0 && i.intervals[n-1].Interval == intv { + return i.intervals[n-1] + } + return nil +} + +func (i *InmemSink) createInterval(intv time.Time) *IntervalMetrics { + i.intervalLock.Lock() + defer i.intervalLock.Unlock() + + // Check for an existing interval + n := len(i.intervals) + if n > 0 && i.intervals[n-1].Interval == intv { + return i.intervals[n-1] + } + + // Add the current interval + current := NewIntervalMetrics(intv) + i.intervals = append(i.intervals, current) + n++ + + // Truncate the intervals if they are too long + if n >= i.maxIntervals { + copy(i.intervals[0:], i.intervals[n-i.maxIntervals:]) + i.intervals = i.intervals[:i.maxIntervals] + } + return current +} + +// getInterval returns the current interval to write to +func (i *InmemSink) getInterval() *IntervalMetrics { + intv := time.Now().Truncate(i.interval) + if m := i.getExistingInterval(intv); m != nil { + return m + } + return i.createInterval(intv) +} + +// Flattens the key for formatting, removes spaces +func (i *InmemSink) flattenKey(parts []string) string { + joined := strings.Join(parts, ".") + return strings.Replace(joined, " ", "_", -1) +} diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/inmem_signal.go b/Godeps/_workspace/src/github.com/armon/go-metrics/inmem_signal.go new file mode 100644 index 00000000000..95d08ee10f0 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/inmem_signal.go @@ -0,0 +1,100 @@ +package metrics + +import ( + "bytes" + "fmt" + "io" + "os" + "os/signal" + "sync" + "syscall" +) + +// InmemSignal is used to listen for a given signal, and when received, +// to dump the current metrics from the InmemSink to an io.Writer +type InmemSignal struct { + signal syscall.Signal + inm *InmemSink + w io.Writer + sigCh chan os.Signal + + stop bool + stopCh chan struct{} + stopLock sync.Mutex +} + +// NewInmemSignal creates a new InmemSignal which listens for a given signal, +// and dumps the current metrics out to a writer +func NewInmemSignal(inmem *InmemSink, sig syscall.Signal, w io.Writer) *InmemSignal { + i := &InmemSignal{ + signal: sig, + inm: inmem, + w: w, + sigCh: make(chan os.Signal, 1), + stopCh: make(chan struct{}), + } + signal.Notify(i.sigCh, sig) + go i.run() + return i +} + +// DefaultInmemSignal returns a new InmemSignal that responds to SIGUSR1 +// and writes output to stderr. Windows uses SIGBREAK +func DefaultInmemSignal(inmem *InmemSink) *InmemSignal { + return NewInmemSignal(inmem, DefaultSignal, os.Stderr) +} + +// Stop is used to stop the InmemSignal from listening +func (i *InmemSignal) Stop() { + i.stopLock.Lock() + defer i.stopLock.Unlock() + + if i.stop { + return + } + i.stop = true + close(i.stopCh) + signal.Stop(i.sigCh) +} + +// run is a long running routine that handles signals +func (i *InmemSignal) run() { + for { + select { + case <-i.sigCh: + i.dumpStats() + case <-i.stopCh: + return + } + } +} + +// dumpStats is used to dump the data to output writer +func (i *InmemSignal) dumpStats() { + buf := bytes.NewBuffer(nil) + + data := i.inm.Data() + // Skip the last period which is still being aggregated + for i := 0; i < len(data)-1; i++ { + intv := data[i] + intv.RLock() + for name, val := range intv.Gauges { + fmt.Fprintf(buf, "[%v][G] '%s': %0.3f\n", intv.Interval, name, val) + } + for name, vals := range intv.Points { + for _, val := range vals { + fmt.Fprintf(buf, "[%v][P] '%s': %0.3f\n", intv.Interval, name, val) + } + } + for name, agg := range intv.Counters { + fmt.Fprintf(buf, "[%v][C] '%s': %s\n", intv.Interval, name, agg) + } + for name, agg := range intv.Samples { + fmt.Fprintf(buf, "[%v][S] '%s': %s\n", intv.Interval, name, agg) + } + intv.RUnlock() + } + + // Write out the bytes + i.w.Write(buf.Bytes()) +} diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/metrics.go b/Godeps/_workspace/src/github.com/armon/go-metrics/metrics.go new file mode 100644 index 00000000000..b818e4182c0 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/metrics.go @@ -0,0 +1,115 @@ +package metrics + +import ( + "runtime" + "time" +) + +func (m *Metrics) SetGauge(key []string, val float32) { + if m.HostName != "" && m.EnableHostname { + key = insert(0, m.HostName, key) + } + if m.EnableTypePrefix { + key = insert(0, "gauge", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + m.sink.SetGauge(key, val) +} + +func (m *Metrics) EmitKey(key []string, val float32) { + if m.EnableTypePrefix { + key = insert(0, "kv", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + m.sink.EmitKey(key, val) +} + +func (m *Metrics) IncrCounter(key []string, val float32) { + if m.EnableTypePrefix { + key = insert(0, "counter", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + m.sink.IncrCounter(key, val) +} + +func (m *Metrics) AddSample(key []string, val float32) { + if m.EnableTypePrefix { + key = insert(0, "sample", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + m.sink.AddSample(key, val) +} + +func (m *Metrics) MeasureSince(key []string, start time.Time) { + if m.EnableTypePrefix { + key = insert(0, "timer", key) + } + if m.ServiceName != "" { + key = insert(0, m.ServiceName, key) + } + now := time.Now() + elapsed := now.Sub(start) + msec := float32(elapsed.Nanoseconds()) / float32(m.TimerGranularity) + m.sink.AddSample(key, msec) +} + +// Periodically collects runtime stats to publish +func (m *Metrics) collectStats() { + for { + time.Sleep(m.ProfileInterval) + m.emitRuntimeStats() + } +} + +// Emits various runtime statsitics +func (m *Metrics) emitRuntimeStats() { + // Export number of Goroutines + numRoutines := runtime.NumGoroutine() + m.SetGauge([]string{"runtime", "num_goroutines"}, float32(numRoutines)) + + // Export memory stats + var stats runtime.MemStats + runtime.ReadMemStats(&stats) + m.SetGauge([]string{"runtime", "alloc_bytes"}, float32(stats.Alloc)) + m.SetGauge([]string{"runtime", "sys_bytes"}, float32(stats.Sys)) + m.SetGauge([]string{"runtime", "malloc_count"}, float32(stats.Mallocs)) + m.SetGauge([]string{"runtime", "free_count"}, float32(stats.Frees)) + m.SetGauge([]string{"runtime", "heap_objects"}, float32(stats.HeapObjects)) + m.SetGauge([]string{"runtime", "total_gc_pause_ns"}, float32(stats.PauseTotalNs)) + m.SetGauge([]string{"runtime", "total_gc_runs"}, float32(stats.NumGC)) + + // Export info about the last few GC runs + num := stats.NumGC + + // Handle wrap around + if num < m.lastNumGC { + m.lastNumGC = 0 + } + + // Ensure we don't scan more than 256 + if num-m.lastNumGC >= 256 { + m.lastNumGC = num - 255 + } + + for i := m.lastNumGC; i < num; i++ { + pause := stats.PauseNs[i%256] + m.AddSample([]string{"runtime", "gc_pause_ns"}, float32(pause)) + } + m.lastNumGC = num +} + +// Inserts a string value at an index into the slice +func insert(i int, v string, s []string) []string { + s = append(s, "") + copy(s[i+1:], s[i:]) + s[i] = v + return s +} diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/prometheus/prometheus.go b/Godeps/_workspace/src/github.com/armon/go-metrics/prometheus/prometheus.go new file mode 100644 index 00000000000..362dbfb623d --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/prometheus/prometheus.go @@ -0,0 +1,88 @@ +// +build go1.3 +package prometheus + +import ( + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +type PrometheusSink struct { + mu sync.Mutex + gauges map[string]prometheus.Gauge + summaries map[string]prometheus.Summary + counters map[string]prometheus.Counter +} + +func NewPrometheusSink() (*PrometheusSink, error) { + return &PrometheusSink{ + gauges: make(map[string]prometheus.Gauge), + summaries: make(map[string]prometheus.Summary), + counters: make(map[string]prometheus.Counter), + }, nil +} + +func (p *PrometheusSink) flattenKey(parts []string) string { + joined := strings.Join(parts, "_") + joined = strings.Replace(joined, " ", "_", -1) + joined = strings.Replace(joined, ".", "_", -1) + joined = strings.Replace(joined, "-", "_", -1) + return joined +} + +func (p *PrometheusSink) SetGauge(parts []string, val float32) { + p.mu.Lock() + defer p.mu.Unlock() + key := p.flattenKey(parts) + g, ok := p.gauges[key] + if !ok { + g = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: key, + Help: key, + }) + prometheus.MustRegister(g) + p.gauges[key] = g + } + g.Set(float64(val)) +} + +func (p *PrometheusSink) AddSample(parts []string, val float32) { + p.mu.Lock() + defer p.mu.Unlock() + key := p.flattenKey(parts) + g, ok := p.summaries[key] + if !ok { + g = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: key, + Help: key, + MaxAge: 10 * time.Second, + }) + prometheus.MustRegister(g) + p.summaries[key] = g + } + g.Observe(float64(val)) +} + +// EmitKey is not implemented. Prometheus doesn’t offer a type for which an +// arbitrary number of values is retained, as Prometheus works with a pull +// model, rather than a push model. +func (p *PrometheusSink) EmitKey(key []string, val float32) { +} + +func (p *PrometheusSink) IncrCounter(parts []string, val float32) { + p.mu.Lock() + defer p.mu.Unlock() + key := p.flattenKey(parts) + g, ok := p.counters[key] + if !ok { + g = prometheus.NewCounter(prometheus.CounterOpts{ + Name: key, + Help: key, + }) + prometheus.MustRegister(g) + p.counters[key] = g + } + g.Add(float64(val)) +} diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/sink.go b/Godeps/_workspace/src/github.com/armon/go-metrics/sink.go new file mode 100644 index 00000000000..0c240c2c47e --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/sink.go @@ -0,0 +1,52 @@ +package metrics + +// The MetricSink interface is used to transmit metrics information +// to an external system +type MetricSink interface { + // A Gauge should retain the last value it is set to + SetGauge(key []string, val float32) + + // Should emit a Key/Value pair for each call + EmitKey(key []string, val float32) + + // Counters should accumulate values + IncrCounter(key []string, val float32) + + // Samples are for timing information, where quantiles are used + AddSample(key []string, val float32) +} + +// BlackholeSink is used to just blackhole messages +type BlackholeSink struct{} + +func (*BlackholeSink) SetGauge(key []string, val float32) {} +func (*BlackholeSink) EmitKey(key []string, val float32) {} +func (*BlackholeSink) IncrCounter(key []string, val float32) {} +func (*BlackholeSink) AddSample(key []string, val float32) {} + +// FanoutSink is used to sink to fanout values to multiple sinks +type FanoutSink []MetricSink + +func (fh FanoutSink) SetGauge(key []string, val float32) { + for _, s := range fh { + s.SetGauge(key, val) + } +} + +func (fh FanoutSink) EmitKey(key []string, val float32) { + for _, s := range fh { + s.EmitKey(key, val) + } +} + +func (fh FanoutSink) IncrCounter(key []string, val float32) { + for _, s := range fh { + s.IncrCounter(key, val) + } +} + +func (fh FanoutSink) AddSample(key []string, val float32) { + for _, s := range fh { + s.AddSample(key, val) + } +} diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/start.go b/Godeps/_workspace/src/github.com/armon/go-metrics/start.go new file mode 100644 index 00000000000..44113f10042 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/start.go @@ -0,0 +1,95 @@ +package metrics + +import ( + "os" + "time" +) + +// Config is used to configure metrics settings +type Config struct { + ServiceName string // Prefixed with keys to seperate services + HostName string // Hostname to use. If not provided and EnableHostname, it will be os.Hostname + EnableHostname bool // Enable prefixing gauge values with hostname + EnableRuntimeMetrics bool // Enables profiling of runtime metrics (GC, Goroutines, Memory) + EnableTypePrefix bool // Prefixes key with a type ("counter", "gauge", "timer") + TimerGranularity time.Duration // Granularity of timers. + ProfileInterval time.Duration // Interval to profile runtime metrics +} + +// Metrics represents an instance of a metrics sink that can +// be used to emit +type Metrics struct { + Config + lastNumGC uint32 + sink MetricSink +} + +// Shared global metrics instance +var globalMetrics *Metrics + +func init() { + // Initialize to a blackhole sink to avoid errors + globalMetrics = &Metrics{sink: &BlackholeSink{}} +} + +// DefaultConfig provides a sane default configuration +func DefaultConfig(serviceName string) *Config { + c := &Config{ + ServiceName: serviceName, // Use client provided service + HostName: "", + EnableHostname: true, // Enable hostname prefix + EnableRuntimeMetrics: true, // Enable runtime profiling + EnableTypePrefix: false, // Disable type prefix + TimerGranularity: time.Millisecond, // Timers are in milliseconds + ProfileInterval: time.Second, // Poll runtime every second + } + + // Try to get the hostname + name, _ := os.Hostname() + c.HostName = name + return c +} + +// New is used to create a new instance of Metrics +func New(conf *Config, sink MetricSink) (*Metrics, error) { + met := &Metrics{} + met.Config = *conf + met.sink = sink + + // Start the runtime collector + if conf.EnableRuntimeMetrics { + go met.collectStats() + } + return met, nil +} + +// NewGlobal is the same as New, but it assigns the metrics object to be +// used globally as well as returning it. +func NewGlobal(conf *Config, sink MetricSink) (*Metrics, error) { + metrics, err := New(conf, sink) + if err == nil { + globalMetrics = metrics + } + return metrics, err +} + +// Proxy all the methods to the globalMetrics instance +func SetGauge(key []string, val float32) { + globalMetrics.SetGauge(key, val) +} + +func EmitKey(key []string, val float32) { + globalMetrics.EmitKey(key, val) +} + +func IncrCounter(key []string, val float32) { + globalMetrics.IncrCounter(key, val) +} + +func AddSample(key []string, val float32) { + globalMetrics.AddSample(key, val) +} + +func MeasureSince(key []string, start time.Time) { + globalMetrics.MeasureSince(key, start) +} diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/statsd.go b/Godeps/_workspace/src/github.com/armon/go-metrics/statsd.go new file mode 100644 index 00000000000..65a5021a057 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/statsd.go @@ -0,0 +1,154 @@ +package metrics + +import ( + "bytes" + "fmt" + "log" + "net" + "strings" + "time" +) + +const ( + // statsdMaxLen is the maximum size of a packet + // to send to statsd + statsdMaxLen = 1400 +) + +// StatsdSink provides a MetricSink that can be used +// with a statsite or statsd metrics server. It uses +// only UDP packets, while StatsiteSink uses TCP. +type StatsdSink struct { + addr string + metricQueue chan string +} + +// NewStatsdSink is used to create a new StatsdSink +func NewStatsdSink(addr string) (*StatsdSink, error) { + s := &StatsdSink{ + addr: addr, + metricQueue: make(chan string, 4096), + } + go s.flushMetrics() + return s, nil +} + +// Close is used to stop flushing to statsd +func (s *StatsdSink) Shutdown() { + close(s.metricQueue) +} + +func (s *StatsdSink) SetGauge(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|g\n", flatKey, val)) +} + +func (s *StatsdSink) EmitKey(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|kv\n", flatKey, val)) +} + +func (s *StatsdSink) IncrCounter(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|c\n", flatKey, val)) +} + +func (s *StatsdSink) AddSample(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|ms\n", flatKey, val)) +} + +// Flattens the key for formatting, removes spaces +func (s *StatsdSink) flattenKey(parts []string) string { + joined := strings.Join(parts, ".") + return strings.Map(func(r rune) rune { + switch r { + case ':': + fallthrough + case ' ': + return '_' + default: + return r + } + }, joined) +} + +// Does a non-blocking push to the metrics queue +func (s *StatsdSink) pushMetric(m string) { + select { + case s.metricQueue <- m: + default: + } +} + +// Flushes metrics +func (s *StatsdSink) flushMetrics() { + var sock net.Conn + var err error + var wait <-chan time.Time + ticker := time.NewTicker(flushInterval) + defer ticker.Stop() + +CONNECT: + // Create a buffer + buf := bytes.NewBuffer(nil) + + // Attempt to connect + sock, err = net.Dial("udp", s.addr) + if err != nil { + log.Printf("[ERR] Error connecting to statsd! Err: %s", err) + goto WAIT + } + + for { + select { + case metric, ok := <-s.metricQueue: + // Get a metric from the queue + if !ok { + goto QUIT + } + + // Check if this would overflow the packet size + if len(metric)+buf.Len() > statsdMaxLen { + _, err := sock.Write(buf.Bytes()) + buf.Reset() + if err != nil { + log.Printf("[ERR] Error writing to statsd! Err: %s", err) + goto WAIT + } + } + + // Append to the buffer + buf.WriteString(metric) + + case <-ticker.C: + if buf.Len() == 0 { + continue + } + + _, err := sock.Write(buf.Bytes()) + buf.Reset() + if err != nil { + log.Printf("[ERR] Error flushing to statsd! Err: %s", err) + goto WAIT + } + } + } + +WAIT: + // Wait for a while + wait = time.After(time.Duration(5) * time.Second) + for { + select { + // Dequeue the messages to avoid backlog + case _, ok := <-s.metricQueue: + if !ok { + goto QUIT + } + case <-wait: + goto CONNECT + } + } +QUIT: + s.metricQueue = nil +} diff --git a/Godeps/_workspace/src/github.com/armon/go-metrics/statsite.go b/Godeps/_workspace/src/github.com/armon/go-metrics/statsite.go new file mode 100644 index 00000000000..68730139a73 --- /dev/null +++ b/Godeps/_workspace/src/github.com/armon/go-metrics/statsite.go @@ -0,0 +1,142 @@ +package metrics + +import ( + "bufio" + "fmt" + "log" + "net" + "strings" + "time" +) + +const ( + // We force flush the statsite metrics after this period of + // inactivity. Prevents stats from getting stuck in a buffer + // forever. + flushInterval = 100 * time.Millisecond +) + +// StatsiteSink provides a MetricSink that can be used with a +// statsite metrics server +type StatsiteSink struct { + addr string + metricQueue chan string +} + +// NewStatsiteSink is used to create a new StatsiteSink +func NewStatsiteSink(addr string) (*StatsiteSink, error) { + s := &StatsiteSink{ + addr: addr, + metricQueue: make(chan string, 4096), + } + go s.flushMetrics() + return s, nil +} + +// Close is used to stop flushing to statsite +func (s *StatsiteSink) Shutdown() { + close(s.metricQueue) +} + +func (s *StatsiteSink) SetGauge(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|g\n", flatKey, val)) +} + +func (s *StatsiteSink) EmitKey(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|kv\n", flatKey, val)) +} + +func (s *StatsiteSink) IncrCounter(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|c\n", flatKey, val)) +} + +func (s *StatsiteSink) AddSample(key []string, val float32) { + flatKey := s.flattenKey(key) + s.pushMetric(fmt.Sprintf("%s:%f|ms\n", flatKey, val)) +} + +// Flattens the key for formatting, removes spaces +func (s *StatsiteSink) flattenKey(parts []string) string { + joined := strings.Join(parts, ".") + return strings.Map(func(r rune) rune { + switch r { + case ':': + fallthrough + case ' ': + return '_' + default: + return r + } + }, joined) +} + +// Does a non-blocking push to the metrics queue +func (s *StatsiteSink) pushMetric(m string) { + select { + case s.metricQueue <- m: + default: + } +} + +// Flushes metrics +func (s *StatsiteSink) flushMetrics() { + var sock net.Conn + var err error + var wait <-chan time.Time + var buffered *bufio.Writer + ticker := time.NewTicker(flushInterval) + defer ticker.Stop() + +CONNECT: + // Attempt to connect + sock, err = net.Dial("tcp", s.addr) + if err != nil { + log.Printf("[ERR] Error connecting to statsite! Err: %s", err) + goto WAIT + } + + // Create a buffered writer + buffered = bufio.NewWriter(sock) + + for { + select { + case metric, ok := <-s.metricQueue: + // Get a metric from the queue + if !ok { + goto QUIT + } + + // Try to send to statsite + _, err := buffered.Write([]byte(metric)) + if err != nil { + log.Printf("[ERR] Error writing to statsite! Err: %s", err) + goto WAIT + } + case <-ticker.C: + if err := buffered.Flush(); err != nil { + log.Printf("[ERR] Error flushing to statsite! Err: %s", err) + goto WAIT + } + } + } + +WAIT: + // Wait for a while + wait = time.After(time.Duration(5) * time.Second) + for { + select { + // Dequeue the messages to avoid backlog + case _, ok := <-s.metricQueue: + if !ok { + goto QUIT + } + case <-wait: + goto CONNECT + } + } +QUIT: + s.metricQueue = nil +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/LICENSE b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/LICENSE new file mode 100644 index 00000000000..ccae99f6a9a --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2012, 2013 Ugorji Nwoke. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the author nor the names of its contributors may be used + to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/0doc.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/0doc.go new file mode 100644 index 00000000000..c14d810a73e --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/0doc.go @@ -0,0 +1,143 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +/* +High Performance, Feature-Rich Idiomatic Go encoding library for msgpack and binc . + +Supported Serialization formats are: + + - msgpack: [https://github.com/msgpack/msgpack] + - binc: [http://github.com/ugorji/binc] + +To install: + + go get github.com/ugorji/go/codec + +The idiomatic Go support is as seen in other encoding packages in +the standard library (ie json, xml, gob, etc). + +Rich Feature Set includes: + + - Simple but extremely powerful and feature-rich API + - Very High Performance. + Our extensive benchmarks show us outperforming Gob, Json and Bson by 2-4X. + This was achieved by taking extreme care on: + - managing allocation + - function frame size (important due to Go's use of split stacks), + - reflection use (and by-passing reflection for common types) + - recursion implications + - zero-copy mode (encoding/decoding to byte slice without using temp buffers) + - Correct. + Care was taken to precisely handle corner cases like: + overflows, nil maps and slices, nil value in stream, etc. + - Efficient zero-copying into temporary byte buffers + when encoding into or decoding from a byte slice. + - Standard field renaming via tags + - Encoding from any value + (struct, slice, map, primitives, pointers, interface{}, etc) + - Decoding into pointer to any non-nil typed value + (struct, slice, map, int, float32, bool, string, reflect.Value, etc) + - Supports extension functions to handle the encode/decode of custom types + - Support Go 1.2 encoding.BinaryMarshaler/BinaryUnmarshaler + - Schema-less decoding + (decode into a pointer to a nil interface{} as opposed to a typed non-nil value). + Includes Options to configure what specific map or slice type to use + when decoding an encoded list or map into a nil interface{} + - Provides a RPC Server and Client Codec for net/rpc communication protocol. + - Msgpack Specific: + - Provides extension functions to handle spec-defined extensions (binary, timestamp) + - Options to resolve ambiguities in handling raw bytes (as string or []byte) + during schema-less decoding (decoding into a nil interface{}) + - RPC Server/Client Codec for msgpack-rpc protocol defined at: + https://github.com/msgpack-rpc/msgpack-rpc/blob/master/spec.md + - Fast Paths for some container types: + For some container types, we circumvent reflection and its associated overhead + and allocation costs, and encode/decode directly. These types are: + []interface{} + []int + []string + map[interface{}]interface{} + map[int]interface{} + map[string]interface{} + +Extension Support + +Users can register a function to handle the encoding or decoding of +their custom types. + +There are no restrictions on what the custom type can be. Some examples: + + type BisSet []int + type BitSet64 uint64 + type UUID string + type MyStructWithUnexportedFields struct { a int; b bool; c []int; } + type GifImage struct { ... } + +As an illustration, MyStructWithUnexportedFields would normally be +encoded as an empty map because it has no exported fields, while UUID +would be encoded as a string. However, with extension support, you can +encode any of these however you like. + +RPC + +RPC Client and Server Codecs are implemented, so the codecs can be used +with the standard net/rpc package. + +Usage + +Typical usage model: + + // create and configure Handle + var ( + bh codec.BincHandle + mh codec.MsgpackHandle + ) + + mh.MapType = reflect.TypeOf(map[string]interface{}(nil)) + + // configure extensions + // e.g. for msgpack, define functions and enable Time support for tag 1 + // mh.AddExt(reflect.TypeOf(time.Time{}), 1, myMsgpackTimeEncodeExtFn, myMsgpackTimeDecodeExtFn) + + // create and use decoder/encoder + var ( + r io.Reader + w io.Writer + b []byte + h = &bh // or mh to use msgpack + ) + + dec = codec.NewDecoder(r, h) + dec = codec.NewDecoderBytes(b, h) + err = dec.Decode(&v) + + enc = codec.NewEncoder(w, h) + enc = codec.NewEncoderBytes(&b, h) + err = enc.Encode(v) + + //RPC Server + go func() { + for { + conn, err := listener.Accept() + rpcCodec := codec.GoRpc.ServerCodec(conn, h) + //OR rpcCodec := codec.MsgpackSpecRpc.ServerCodec(conn, h) + rpc.ServeCodec(rpcCodec) + } + }() + + //RPC Communication (client side) + conn, err = net.Dial("tcp", "localhost:5555") + rpcCodec := codec.GoRpc.ClientCodec(conn, h) + //OR rpcCodec := codec.MsgpackSpecRpc.ClientCodec(conn, h) + client := rpc.NewClientWithCodec(rpcCodec) + +Representative Benchmark Results + +Run the benchmark suite using: + go test -bi -bench=. -benchmem + +To run full benchmark suite (including against vmsgpack and bson), +see notes in ext_dep_test.go + +*/ +package codec diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/README.md b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/README.md new file mode 100644 index 00000000000..6c95d1bfd20 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/README.md @@ -0,0 +1,174 @@ +# Codec + +High Performance and Feature-Rich Idiomatic Go Library providing +encode/decode support for different serialization formats. + +Supported Serialization formats are: + + - msgpack: [https://github.com/msgpack/msgpack] + - binc: [http://github.com/ugorji/binc] + +To install: + + go get github.com/ugorji/go/codec + +Online documentation: [http://godoc.org/github.com/ugorji/go/codec] + +The idiomatic Go support is as seen in other encoding packages in +the standard library (ie json, xml, gob, etc). + +Rich Feature Set includes: + + - Simple but extremely powerful and feature-rich API + - Very High Performance. + Our extensive benchmarks show us outperforming Gob, Json and Bson by 2-4X. + This was achieved by taking extreme care on: + - managing allocation + - function frame size (important due to Go's use of split stacks), + - reflection use (and by-passing reflection for common types) + - recursion implications + - zero-copy mode (encoding/decoding to byte slice without using temp buffers) + - Correct. + Care was taken to precisely handle corner cases like: + overflows, nil maps and slices, nil value in stream, etc. + - Efficient zero-copying into temporary byte buffers + when encoding into or decoding from a byte slice. + - Standard field renaming via tags + - Encoding from any value + (struct, slice, map, primitives, pointers, interface{}, etc) + - Decoding into pointer to any non-nil typed value + (struct, slice, map, int, float32, bool, string, reflect.Value, etc) + - Supports extension functions to handle the encode/decode of custom types + - Support Go 1.2 encoding.BinaryMarshaler/BinaryUnmarshaler + - Schema-less decoding + (decode into a pointer to a nil interface{} as opposed to a typed non-nil value). + Includes Options to configure what specific map or slice type to use + when decoding an encoded list or map into a nil interface{} + - Provides a RPC Server and Client Codec for net/rpc communication protocol. + - Msgpack Specific: + - Provides extension functions to handle spec-defined extensions (binary, timestamp) + - Options to resolve ambiguities in handling raw bytes (as string or []byte) + during schema-less decoding (decoding into a nil interface{}) + - RPC Server/Client Codec for msgpack-rpc protocol defined at: + https://github.com/msgpack-rpc/msgpack-rpc/blob/master/spec.md + - Fast Paths for some container types: + For some container types, we circumvent reflection and its associated overhead + and allocation costs, and encode/decode directly. These types are: + []interface{} + []int + []string + map[interface{}]interface{} + map[int]interface{} + map[string]interface{} + +## Extension Support + +Users can register a function to handle the encoding or decoding of +their custom types. + +There are no restrictions on what the custom type can be. Some examples: + + type BisSet []int + type BitSet64 uint64 + type UUID string + type MyStructWithUnexportedFields struct { a int; b bool; c []int; } + type GifImage struct { ... } + +As an illustration, MyStructWithUnexportedFields would normally be +encoded as an empty map because it has no exported fields, while UUID +would be encoded as a string. However, with extension support, you can +encode any of these however you like. + +## RPC + +RPC Client and Server Codecs are implemented, so the codecs can be used +with the standard net/rpc package. + +## Usage + +Typical usage model: + + // create and configure Handle + var ( + bh codec.BincHandle + mh codec.MsgpackHandle + ) + + mh.MapType = reflect.TypeOf(map[string]interface{}(nil)) + + // configure extensions + // e.g. for msgpack, define functions and enable Time support for tag 1 + // mh.AddExt(reflect.TypeOf(time.Time{}), 1, myMsgpackTimeEncodeExtFn, myMsgpackTimeDecodeExtFn) + + // create and use decoder/encoder + var ( + r io.Reader + w io.Writer + b []byte + h = &bh // or mh to use msgpack + ) + + dec = codec.NewDecoder(r, h) + dec = codec.NewDecoderBytes(b, h) + err = dec.Decode(&v) + + enc = codec.NewEncoder(w, h) + enc = codec.NewEncoderBytes(&b, h) + err = enc.Encode(v) + + //RPC Server + go func() { + for { + conn, err := listener.Accept() + rpcCodec := codec.GoRpc.ServerCodec(conn, h) + //OR rpcCodec := codec.MsgpackSpecRpc.ServerCodec(conn, h) + rpc.ServeCodec(rpcCodec) + } + }() + + //RPC Communication (client side) + conn, err = net.Dial("tcp", "localhost:5555") + rpcCodec := codec.GoRpc.ClientCodec(conn, h) + //OR rpcCodec := codec.MsgpackSpecRpc.ClientCodec(conn, h) + client := rpc.NewClientWithCodec(rpcCodec) + +## Representative Benchmark Results + +A sample run of benchmark using "go test -bi -bench=. -benchmem": + + /proc/cpuinfo: Intel(R) Core(TM) i7-2630QM CPU @ 2.00GHz (HT) + + .............................................. + BENCHMARK INIT: 2013-10-16 11:02:50.345970786 -0400 EDT + To run full benchmark comparing encodings (MsgPack, Binc, JSON, GOB, etc), use: "go test -bench=." + Benchmark: + Struct recursive Depth: 1 + ApproxDeepSize Of benchmark Struct: 4694 bytes + Benchmark One-Pass Run: + v-msgpack: len: 1600 bytes + bson: len: 3025 bytes + msgpack: len: 1560 bytes + binc: len: 1187 bytes + gob: len: 1972 bytes + json: len: 2538 bytes + .............................................. + PASS + Benchmark__Msgpack____Encode 50000 54359 ns/op 14953 B/op 83 allocs/op + Benchmark__Msgpack____Decode 10000 106531 ns/op 14990 B/op 410 allocs/op + Benchmark__Binc_NoSym_Encode 50000 53956 ns/op 14966 B/op 83 allocs/op + Benchmark__Binc_NoSym_Decode 10000 103751 ns/op 14529 B/op 386 allocs/op + Benchmark__Binc_Sym___Encode 50000 65961 ns/op 17130 B/op 88 allocs/op + Benchmark__Binc_Sym___Decode 10000 106310 ns/op 15857 B/op 287 allocs/op + Benchmark__Gob________Encode 10000 135944 ns/op 21189 B/op 237 allocs/op + Benchmark__Gob________Decode 5000 405390 ns/op 83460 B/op 1841 allocs/op + Benchmark__Json_______Encode 20000 79412 ns/op 13874 B/op 102 allocs/op + Benchmark__Json_______Decode 10000 247979 ns/op 14202 B/op 493 allocs/op + Benchmark__Bson_______Encode 10000 121762 ns/op 27814 B/op 514 allocs/op + Benchmark__Bson_______Decode 10000 162126 ns/op 16514 B/op 789 allocs/op + Benchmark__VMsgpack___Encode 50000 69155 ns/op 12370 B/op 344 allocs/op + Benchmark__VMsgpack___Decode 10000 151609 ns/op 20307 B/op 571 allocs/op + ok ugorji.net/codec 30.827s + +To run full benchmark suite (including against vmsgpack and bson), +see notes in ext\_dep\_test.go + diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/binc.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/binc.go new file mode 100644 index 00000000000..2bb5e8fee85 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/binc.go @@ -0,0 +1,786 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +package codec + +import ( + "math" + // "reflect" + // "sync/atomic" + "time" + //"fmt" +) + +const bincDoPrune = true // No longer needed. Needed before as C lib did not support pruning. + +//var _ = fmt.Printf + +// vd as low 4 bits (there are 16 slots) +const ( + bincVdSpecial byte = iota + bincVdPosInt + bincVdNegInt + bincVdFloat + + bincVdString + bincVdByteArray + bincVdArray + bincVdMap + + bincVdTimestamp + bincVdSmallInt + bincVdUnicodeOther + bincVdSymbol + + bincVdDecimal + _ // open slot + _ // open slot + bincVdCustomExt = 0x0f +) + +const ( + bincSpNil byte = iota + bincSpFalse + bincSpTrue + bincSpNan + bincSpPosInf + bincSpNegInf + bincSpZeroFloat + bincSpZero + bincSpNegOne +) + +const ( + bincFlBin16 byte = iota + bincFlBin32 + _ // bincFlBin32e + bincFlBin64 + _ // bincFlBin64e + // others not currently supported +) + +type bincEncDriver struct { + w encWriter + m map[string]uint16 // symbols + s uint32 // symbols sequencer + b [8]byte +} + +func (e *bincEncDriver) isBuiltinType(rt uintptr) bool { + return rt == timeTypId +} + +func (e *bincEncDriver) encodeBuiltin(rt uintptr, v interface{}) { + switch rt { + case timeTypId: + bs := encodeTime(v.(time.Time)) + e.w.writen1(bincVdTimestamp<<4 | uint8(len(bs))) + e.w.writeb(bs) + } +} + +func (e *bincEncDriver) encodeNil() { + e.w.writen1(bincVdSpecial<<4 | bincSpNil) +} + +func (e *bincEncDriver) encodeBool(b bool) { + if b { + e.w.writen1(bincVdSpecial<<4 | bincSpTrue) + } else { + e.w.writen1(bincVdSpecial<<4 | bincSpFalse) + } +} + +func (e *bincEncDriver) encodeFloat32(f float32) { + if f == 0 { + e.w.writen1(bincVdSpecial<<4 | bincSpZeroFloat) + return + } + e.w.writen1(bincVdFloat<<4 | bincFlBin32) + e.w.writeUint32(math.Float32bits(f)) +} + +func (e *bincEncDriver) encodeFloat64(f float64) { + if f == 0 { + e.w.writen1(bincVdSpecial<<4 | bincSpZeroFloat) + return + } + bigen.PutUint64(e.b[:], math.Float64bits(f)) + if bincDoPrune { + i := 7 + for ; i >= 0 && (e.b[i] == 0); i-- { + } + i++ + if i <= 6 { + e.w.writen1(bincVdFloat<<4 | 0x8 | bincFlBin64) + e.w.writen1(byte(i)) + e.w.writeb(e.b[:i]) + return + } + } + e.w.writen1(bincVdFloat<<4 | bincFlBin64) + e.w.writeb(e.b[:]) +} + +func (e *bincEncDriver) encIntegerPrune(bd byte, pos bool, v uint64, lim uint8) { + if lim == 4 { + bigen.PutUint32(e.b[:lim], uint32(v)) + } else { + bigen.PutUint64(e.b[:lim], v) + } + if bincDoPrune { + i := pruneSignExt(e.b[:lim], pos) + e.w.writen1(bd | lim - 1 - byte(i)) + e.w.writeb(e.b[i:lim]) + } else { + e.w.writen1(bd | lim - 1) + e.w.writeb(e.b[:lim]) + } +} + +func (e *bincEncDriver) encodeInt(v int64) { + const nbd byte = bincVdNegInt << 4 + switch { + case v >= 0: + e.encUint(bincVdPosInt<<4, true, uint64(v)) + case v == -1: + e.w.writen1(bincVdSpecial<<4 | bincSpNegOne) + default: + e.encUint(bincVdNegInt<<4, false, uint64(-v)) + } +} + +func (e *bincEncDriver) encodeUint(v uint64) { + e.encUint(bincVdPosInt<<4, true, v) +} + +func (e *bincEncDriver) encUint(bd byte, pos bool, v uint64) { + switch { + case v == 0: + e.w.writen1(bincVdSpecial<<4 | bincSpZero) + case pos && v >= 1 && v <= 16: + e.w.writen1(bincVdSmallInt<<4 | byte(v-1)) + case v <= math.MaxUint8: + e.w.writen2(bd|0x0, byte(v)) + case v <= math.MaxUint16: + e.w.writen1(bd | 0x01) + e.w.writeUint16(uint16(v)) + case v <= math.MaxUint32: + e.encIntegerPrune(bd, pos, v, 4) + default: + e.encIntegerPrune(bd, pos, v, 8) + } +} + +func (e *bincEncDriver) encodeExtPreamble(xtag byte, length int) { + e.encLen(bincVdCustomExt<<4, uint64(length)) + e.w.writen1(xtag) +} + +func (e *bincEncDriver) encodeArrayPreamble(length int) { + e.encLen(bincVdArray<<4, uint64(length)) +} + +func (e *bincEncDriver) encodeMapPreamble(length int) { + e.encLen(bincVdMap<<4, uint64(length)) +} + +func (e *bincEncDriver) encodeString(c charEncoding, v string) { + l := uint64(len(v)) + e.encBytesLen(c, l) + if l > 0 { + e.w.writestr(v) + } +} + +func (e *bincEncDriver) encodeSymbol(v string) { + // if WriteSymbolsNoRefs { + // e.encodeString(c_UTF8, v) + // return + // } + + //symbols only offer benefit when string length > 1. + //This is because strings with length 1 take only 2 bytes to store + //(bd with embedded length, and single byte for string val). + + l := len(v) + switch l { + case 0: + e.encBytesLen(c_UTF8, 0) + return + case 1: + e.encBytesLen(c_UTF8, 1) + e.w.writen1(v[0]) + return + } + if e.m == nil { + e.m = make(map[string]uint16, 16) + } + ui, ok := e.m[v] + if ok { + if ui <= math.MaxUint8 { + e.w.writen2(bincVdSymbol<<4, byte(ui)) + } else { + e.w.writen1(bincVdSymbol<<4 | 0x8) + e.w.writeUint16(ui) + } + } else { + e.s++ + ui = uint16(e.s) + //ui = uint16(atomic.AddUint32(&e.s, 1)) + e.m[v] = ui + var lenprec uint8 + switch { + case l <= math.MaxUint8: + // lenprec = 0 + case l <= math.MaxUint16: + lenprec = 1 + case int64(l) <= math.MaxUint32: + lenprec = 2 + default: + lenprec = 3 + } + if ui <= math.MaxUint8 { + e.w.writen2(bincVdSymbol<<4|0x0|0x4|lenprec, byte(ui)) + } else { + e.w.writen1(bincVdSymbol<<4 | 0x8 | 0x4 | lenprec) + e.w.writeUint16(ui) + } + switch lenprec { + case 0: + e.w.writen1(byte(l)) + case 1: + e.w.writeUint16(uint16(l)) + case 2: + e.w.writeUint32(uint32(l)) + default: + e.w.writeUint64(uint64(l)) + } + e.w.writestr(v) + } +} + +func (e *bincEncDriver) encodeStringBytes(c charEncoding, v []byte) { + l := uint64(len(v)) + e.encBytesLen(c, l) + if l > 0 { + e.w.writeb(v) + } +} + +func (e *bincEncDriver) encBytesLen(c charEncoding, length uint64) { + //TODO: support bincUnicodeOther (for now, just use string or bytearray) + if c == c_RAW { + e.encLen(bincVdByteArray<<4, length) + } else { + e.encLen(bincVdString<<4, length) + } +} + +func (e *bincEncDriver) encLen(bd byte, l uint64) { + if l < 12 { + e.w.writen1(bd | uint8(l+4)) + } else { + e.encLenNumber(bd, l) + } +} + +func (e *bincEncDriver) encLenNumber(bd byte, v uint64) { + switch { + case v <= math.MaxUint8: + e.w.writen2(bd, byte(v)) + case v <= math.MaxUint16: + e.w.writen1(bd | 0x01) + e.w.writeUint16(uint16(v)) + case v <= math.MaxUint32: + e.w.writen1(bd | 0x02) + e.w.writeUint32(uint32(v)) + default: + e.w.writen1(bd | 0x03) + e.w.writeUint64(uint64(v)) + } +} + +//------------------------------------ + +type bincDecDriver struct { + r decReader + bdRead bool + bdType valueType + bd byte + vd byte + vs byte + b [8]byte + m map[uint32]string // symbols (use uint32 as key, as map optimizes for it) +} + +func (d *bincDecDriver) initReadNext() { + if d.bdRead { + return + } + d.bd = d.r.readn1() + d.vd = d.bd >> 4 + d.vs = d.bd & 0x0f + d.bdRead = true + d.bdType = valueTypeUnset +} + +func (d *bincDecDriver) currentEncodedType() valueType { + if d.bdType == valueTypeUnset { + switch d.vd { + case bincVdSpecial: + switch d.vs { + case bincSpNil: + d.bdType = valueTypeNil + case bincSpFalse, bincSpTrue: + d.bdType = valueTypeBool + case bincSpNan, bincSpNegInf, bincSpPosInf, bincSpZeroFloat: + d.bdType = valueTypeFloat + case bincSpZero: + d.bdType = valueTypeUint + case bincSpNegOne: + d.bdType = valueTypeInt + default: + decErr("currentEncodedType: Unrecognized special value 0x%x", d.vs) + } + case bincVdSmallInt: + d.bdType = valueTypeUint + case bincVdPosInt: + d.bdType = valueTypeUint + case bincVdNegInt: + d.bdType = valueTypeInt + case bincVdFloat: + d.bdType = valueTypeFloat + case bincVdString: + d.bdType = valueTypeString + case bincVdSymbol: + d.bdType = valueTypeSymbol + case bincVdByteArray: + d.bdType = valueTypeBytes + case bincVdTimestamp: + d.bdType = valueTypeTimestamp + case bincVdCustomExt: + d.bdType = valueTypeExt + case bincVdArray: + d.bdType = valueTypeArray + case bincVdMap: + d.bdType = valueTypeMap + default: + decErr("currentEncodedType: Unrecognized d.vd: 0x%x", d.vd) + } + } + return d.bdType +} + +func (d *bincDecDriver) tryDecodeAsNil() bool { + if d.bd == bincVdSpecial<<4|bincSpNil { + d.bdRead = false + return true + } + return false +} + +func (d *bincDecDriver) isBuiltinType(rt uintptr) bool { + return rt == timeTypId +} + +func (d *bincDecDriver) decodeBuiltin(rt uintptr, v interface{}) { + switch rt { + case timeTypId: + if d.vd != bincVdTimestamp { + decErr("Invalid d.vd. Expecting 0x%x. Received: 0x%x", bincVdTimestamp, d.vd) + } + tt, err := decodeTime(d.r.readn(int(d.vs))) + if err != nil { + panic(err) + } + var vt *time.Time = v.(*time.Time) + *vt = tt + d.bdRead = false + } +} + +func (d *bincDecDriver) decFloatPre(vs, defaultLen byte) { + if vs&0x8 == 0 { + d.r.readb(d.b[0:defaultLen]) + } else { + l := d.r.readn1() + if l > 8 { + decErr("At most 8 bytes used to represent float. Received: %v bytes", l) + } + for i := l; i < 8; i++ { + d.b[i] = 0 + } + d.r.readb(d.b[0:l]) + } +} + +func (d *bincDecDriver) decFloat() (f float64) { + //if true { f = math.Float64frombits(d.r.readUint64()); break; } + switch vs := d.vs; vs & 0x7 { + case bincFlBin32: + d.decFloatPre(vs, 4) + f = float64(math.Float32frombits(bigen.Uint32(d.b[0:4]))) + case bincFlBin64: + d.decFloatPre(vs, 8) + f = math.Float64frombits(bigen.Uint64(d.b[0:8])) + default: + decErr("only float32 and float64 are supported. d.vd: 0x%x, d.vs: 0x%x", d.vd, d.vs) + } + return +} + +func (d *bincDecDriver) decUint() (v uint64) { + // need to inline the code (interface conversion and type assertion expensive) + switch d.vs { + case 0: + v = uint64(d.r.readn1()) + case 1: + d.r.readb(d.b[6:]) + v = uint64(bigen.Uint16(d.b[6:])) + case 2: + d.b[4] = 0 + d.r.readb(d.b[5:]) + v = uint64(bigen.Uint32(d.b[4:])) + case 3: + d.r.readb(d.b[4:]) + v = uint64(bigen.Uint32(d.b[4:])) + case 4, 5, 6: + lim := int(7 - d.vs) + d.r.readb(d.b[lim:]) + for i := 0; i < lim; i++ { + d.b[i] = 0 + } + v = uint64(bigen.Uint64(d.b[:])) + case 7: + d.r.readb(d.b[:]) + v = uint64(bigen.Uint64(d.b[:])) + default: + decErr("unsigned integers with greater than 64 bits of precision not supported") + } + return +} + +func (d *bincDecDriver) decIntAny() (ui uint64, i int64, neg bool) { + switch d.vd { + case bincVdPosInt: + ui = d.decUint() + i = int64(ui) + case bincVdNegInt: + ui = d.decUint() + i = -(int64(ui)) + neg = true + case bincVdSmallInt: + i = int64(d.vs) + 1 + ui = uint64(d.vs) + 1 + case bincVdSpecial: + switch d.vs { + case bincSpZero: + //i = 0 + case bincSpNegOne: + neg = true + ui = 1 + i = -1 + default: + decErr("numeric decode fails for special value: d.vs: 0x%x", d.vs) + } + default: + decErr("number can only be decoded from uint or int values. d.bd: 0x%x, d.vd: 0x%x", d.bd, d.vd) + } + return +} + +func (d *bincDecDriver) decodeInt(bitsize uint8) (i int64) { + _, i, _ = d.decIntAny() + checkOverflow(0, i, bitsize) + d.bdRead = false + return +} + +func (d *bincDecDriver) decodeUint(bitsize uint8) (ui uint64) { + ui, i, neg := d.decIntAny() + if neg { + decErr("Assigning negative signed value: %v, to unsigned type", i) + } + checkOverflow(ui, 0, bitsize) + d.bdRead = false + return +} + +func (d *bincDecDriver) decodeFloat(chkOverflow32 bool) (f float64) { + switch d.vd { + case bincVdSpecial: + d.bdRead = false + switch d.vs { + case bincSpNan: + return math.NaN() + case bincSpPosInf: + return math.Inf(1) + case bincSpZeroFloat, bincSpZero: + return + case bincSpNegInf: + return math.Inf(-1) + default: + decErr("Invalid d.vs decoding float where d.vd=bincVdSpecial: %v", d.vs) + } + case bincVdFloat: + f = d.decFloat() + default: + _, i, _ := d.decIntAny() + f = float64(i) + } + checkOverflowFloat32(f, chkOverflow32) + d.bdRead = false + return +} + +// bool can be decoded from bool only (single byte). +func (d *bincDecDriver) decodeBool() (b bool) { + switch d.bd { + case (bincVdSpecial | bincSpFalse): + // b = false + case (bincVdSpecial | bincSpTrue): + b = true + default: + decErr("Invalid single-byte value for bool: %s: %x", msgBadDesc, d.bd) + } + d.bdRead = false + return +} + +func (d *bincDecDriver) readMapLen() (length int) { + if d.vd != bincVdMap { + decErr("Invalid d.vd for map. Expecting 0x%x. Got: 0x%x", bincVdMap, d.vd) + } + length = d.decLen() + d.bdRead = false + return +} + +func (d *bincDecDriver) readArrayLen() (length int) { + if d.vd != bincVdArray { + decErr("Invalid d.vd for array. Expecting 0x%x. Got: 0x%x", bincVdArray, d.vd) + } + length = d.decLen() + d.bdRead = false + return +} + +func (d *bincDecDriver) decLen() int { + if d.vs <= 3 { + return int(d.decUint()) + } + return int(d.vs - 4) +} + +func (d *bincDecDriver) decodeString() (s string) { + switch d.vd { + case bincVdString, bincVdByteArray: + if length := d.decLen(); length > 0 { + s = string(d.r.readn(length)) + } + case bincVdSymbol: + //from vs: extract numSymbolBytes, containsStringVal, strLenPrecision, + //extract symbol + //if containsStringVal, read it and put in map + //else look in map for string value + var symbol uint32 + vs := d.vs + //fmt.Printf(">>>> d.vs: 0b%b, & 0x8: %v, & 0x4: %v\n", d.vs, vs & 0x8, vs & 0x4) + if vs&0x8 == 0 { + symbol = uint32(d.r.readn1()) + } else { + symbol = uint32(d.r.readUint16()) + } + if d.m == nil { + d.m = make(map[uint32]string, 16) + } + + if vs&0x4 == 0 { + s = d.m[symbol] + } else { + var slen int + switch vs & 0x3 { + case 0: + slen = int(d.r.readn1()) + case 1: + slen = int(d.r.readUint16()) + case 2: + slen = int(d.r.readUint32()) + case 3: + slen = int(d.r.readUint64()) + } + s = string(d.r.readn(slen)) + d.m[symbol] = s + } + default: + decErr("Invalid d.vd for string. Expecting string:0x%x, bytearray:0x%x or symbol: 0x%x. Got: 0x%x", + bincVdString, bincVdByteArray, bincVdSymbol, d.vd) + } + d.bdRead = false + return +} + +func (d *bincDecDriver) decodeBytes(bs []byte) (bsOut []byte, changed bool) { + var clen int + switch d.vd { + case bincVdString, bincVdByteArray: + clen = d.decLen() + default: + decErr("Invalid d.vd for bytes. Expecting string:0x%x or bytearray:0x%x. Got: 0x%x", + bincVdString, bincVdByteArray, d.vd) + } + if clen > 0 { + // if no contents in stream, don't update the passed byteslice + if len(bs) != clen { + if len(bs) > clen { + bs = bs[:clen] + } else { + bs = make([]byte, clen) + } + bsOut = bs + changed = true + } + d.r.readb(bs) + } + d.bdRead = false + return +} + +func (d *bincDecDriver) decodeExt(verifyTag bool, tag byte) (xtag byte, xbs []byte) { + switch d.vd { + case bincVdCustomExt: + l := d.decLen() + xtag = d.r.readn1() + if verifyTag && xtag != tag { + decErr("Wrong extension tag. Got %b. Expecting: %v", xtag, tag) + } + xbs = d.r.readn(l) + case bincVdByteArray: + xbs, _ = d.decodeBytes(nil) + default: + decErr("Invalid d.vd for extensions (Expecting extensions or byte array). Got: 0x%x", d.vd) + } + d.bdRead = false + return +} + +func (d *bincDecDriver) decodeNaked() (v interface{}, vt valueType, decodeFurther bool) { + d.initReadNext() + + switch d.vd { + case bincVdSpecial: + switch d.vs { + case bincSpNil: + vt = valueTypeNil + case bincSpFalse: + vt = valueTypeBool + v = false + case bincSpTrue: + vt = valueTypeBool + v = true + case bincSpNan: + vt = valueTypeFloat + v = math.NaN() + case bincSpPosInf: + vt = valueTypeFloat + v = math.Inf(1) + case bincSpNegInf: + vt = valueTypeFloat + v = math.Inf(-1) + case bincSpZeroFloat: + vt = valueTypeFloat + v = float64(0) + case bincSpZero: + vt = valueTypeUint + v = int64(0) // int8(0) + case bincSpNegOne: + vt = valueTypeInt + v = int64(-1) // int8(-1) + default: + decErr("decodeNaked: Unrecognized special value 0x%x", d.vs) + } + case bincVdSmallInt: + vt = valueTypeUint + v = uint64(int8(d.vs)) + 1 // int8(d.vs) + 1 + case bincVdPosInt: + vt = valueTypeUint + v = d.decUint() + case bincVdNegInt: + vt = valueTypeInt + v = -(int64(d.decUint())) + case bincVdFloat: + vt = valueTypeFloat + v = d.decFloat() + case bincVdSymbol: + vt = valueTypeSymbol + v = d.decodeString() + case bincVdString: + vt = valueTypeString + v = d.decodeString() + case bincVdByteArray: + vt = valueTypeBytes + v, _ = d.decodeBytes(nil) + case bincVdTimestamp: + vt = valueTypeTimestamp + tt, err := decodeTime(d.r.readn(int(d.vs))) + if err != nil { + panic(err) + } + v = tt + case bincVdCustomExt: + vt = valueTypeExt + l := d.decLen() + var re RawExt + re.Tag = d.r.readn1() + re.Data = d.r.readn(l) + v = &re + vt = valueTypeExt + case bincVdArray: + vt = valueTypeArray + decodeFurther = true + case bincVdMap: + vt = valueTypeMap + decodeFurther = true + default: + decErr("decodeNaked: Unrecognized d.vd: 0x%x", d.vd) + } + + if !decodeFurther { + d.bdRead = false + } + return +} + +//------------------------------------ + +//BincHandle is a Handle for the Binc Schema-Free Encoding Format +//defined at https://github.com/ugorji/binc . +// +//BincHandle currently supports all Binc features with the following EXCEPTIONS: +// - only integers up to 64 bits of precision are supported. +// big integers are unsupported. +// - Only IEEE 754 binary32 and binary64 floats are supported (ie Go float32 and float64 types). +// extended precision and decimal IEEE 754 floats are unsupported. +// - Only UTF-8 strings supported. +// Unicode_Other Binc types (UTF16, UTF32) are currently unsupported. +//Note that these EXCEPTIONS are temporary and full support is possible and may happen soon. +type BincHandle struct { + BasicHandle +} + +func (h *BincHandle) newEncDriver(w encWriter) encDriver { + return &bincEncDriver{w: w} +} + +func (h *BincHandle) newDecDriver(r decReader) decDriver { + return &bincDecDriver{r: r} +} + +func (_ *BincHandle) writeExt() bool { + return true +} + +func (h *BincHandle) getBasicHandle() *BasicHandle { + return &h.BasicHandle +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/decode.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/decode.go new file mode 100644 index 00000000000..87bef2b9358 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/decode.go @@ -0,0 +1,1048 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +package codec + +import ( + "io" + "reflect" + // "runtime/debug" +) + +// Some tagging information for error messages. +const ( + msgTagDec = "codec.decoder" + msgBadDesc = "Unrecognized descriptor byte" + msgDecCannotExpandArr = "cannot expand go array from %v to stream length: %v" +) + +// decReader abstracts the reading source, allowing implementations that can +// read from an io.Reader or directly off a byte slice with zero-copying. +type decReader interface { + readn(n int) []byte + readb([]byte) + readn1() uint8 + readUint16() uint16 + readUint32() uint32 + readUint64() uint64 +} + +type decDriver interface { + initReadNext() + tryDecodeAsNil() bool + currentEncodedType() valueType + isBuiltinType(rt uintptr) bool + decodeBuiltin(rt uintptr, v interface{}) + //decodeNaked: Numbers are decoded as int64, uint64, float64 only (no smaller sized number types). + decodeNaked() (v interface{}, vt valueType, decodeFurther bool) + decodeInt(bitsize uint8) (i int64) + decodeUint(bitsize uint8) (ui uint64) + decodeFloat(chkOverflow32 bool) (f float64) + decodeBool() (b bool) + // decodeString can also decode symbols + decodeString() (s string) + decodeBytes(bs []byte) (bsOut []byte, changed bool) + decodeExt(verifyTag bool, tag byte) (xtag byte, xbs []byte) + readMapLen() int + readArrayLen() int +} + +type DecodeOptions struct { + // An instance of MapType is used during schema-less decoding of a map in the stream. + // If nil, we use map[interface{}]interface{} + MapType reflect.Type + // An instance of SliceType is used during schema-less decoding of an array in the stream. + // If nil, we use []interface{} + SliceType reflect.Type + // ErrorIfNoField controls whether an error is returned when decoding a map + // from a codec stream into a struct, and no matching struct field is found. + ErrorIfNoField bool +} + +// ------------------------------------ + +// ioDecReader is a decReader that reads off an io.Reader +type ioDecReader struct { + r io.Reader + br io.ByteReader + x [8]byte //temp byte array re-used internally for efficiency +} + +func (z *ioDecReader) readn(n int) (bs []byte) { + if n <= 0 { + return + } + bs = make([]byte, n) + if _, err := io.ReadAtLeast(z.r, bs, n); err != nil { + panic(err) + } + return +} + +func (z *ioDecReader) readb(bs []byte) { + if _, err := io.ReadAtLeast(z.r, bs, len(bs)); err != nil { + panic(err) + } +} + +func (z *ioDecReader) readn1() uint8 { + if z.br != nil { + b, err := z.br.ReadByte() + if err != nil { + panic(err) + } + return b + } + z.readb(z.x[:1]) + return z.x[0] +} + +func (z *ioDecReader) readUint16() uint16 { + z.readb(z.x[:2]) + return bigen.Uint16(z.x[:2]) +} + +func (z *ioDecReader) readUint32() uint32 { + z.readb(z.x[:4]) + return bigen.Uint32(z.x[:4]) +} + +func (z *ioDecReader) readUint64() uint64 { + z.readb(z.x[:8]) + return bigen.Uint64(z.x[:8]) +} + +// ------------------------------------ + +// bytesDecReader is a decReader that reads off a byte slice with zero copying +type bytesDecReader struct { + b []byte // data + c int // cursor + a int // available +} + +func (z *bytesDecReader) consume(n int) (oldcursor int) { + if z.a == 0 { + panic(io.EOF) + } + if n > z.a { + decErr("Trying to read %v bytes. Only %v available", n, z.a) + } + // z.checkAvailable(n) + oldcursor = z.c + z.c = oldcursor + n + z.a = z.a - n + return +} + +func (z *bytesDecReader) readn(n int) (bs []byte) { + if n <= 0 { + return + } + c0 := z.consume(n) + bs = z.b[c0:z.c] + return +} + +func (z *bytesDecReader) readb(bs []byte) { + copy(bs, z.readn(len(bs))) +} + +func (z *bytesDecReader) readn1() uint8 { + c0 := z.consume(1) + return z.b[c0] +} + +// Use binaryEncoding helper for 4 and 8 bits, but inline it for 2 bits +// creating temp slice variable and copying it to helper function is expensive +// for just 2 bits. + +func (z *bytesDecReader) readUint16() uint16 { + c0 := z.consume(2) + return uint16(z.b[c0+1]) | uint16(z.b[c0])<<8 +} + +func (z *bytesDecReader) readUint32() uint32 { + c0 := z.consume(4) + return bigen.Uint32(z.b[c0:z.c]) +} + +func (z *bytesDecReader) readUint64() uint64 { + c0 := z.consume(8) + return bigen.Uint64(z.b[c0:z.c]) +} + +// ------------------------------------ + +// decFnInfo has methods for registering handling decoding of a specific type +// based on some characteristics (builtin, extension, reflect Kind, etc) +type decFnInfo struct { + ti *typeInfo + d *Decoder + dd decDriver + xfFn func(reflect.Value, []byte) error + xfTag byte + array bool +} + +func (f *decFnInfo) builtin(rv reflect.Value) { + f.dd.decodeBuiltin(f.ti.rtid, rv.Addr().Interface()) +} + +func (f *decFnInfo) rawExt(rv reflect.Value) { + xtag, xbs := f.dd.decodeExt(false, 0) + rv.Field(0).SetUint(uint64(xtag)) + rv.Field(1).SetBytes(xbs) +} + +func (f *decFnInfo) ext(rv reflect.Value) { + _, xbs := f.dd.decodeExt(true, f.xfTag) + if fnerr := f.xfFn(rv, xbs); fnerr != nil { + panic(fnerr) + } +} + +func (f *decFnInfo) binaryMarshal(rv reflect.Value) { + var bm binaryUnmarshaler + if f.ti.unmIndir == -1 { + bm = rv.Addr().Interface().(binaryUnmarshaler) + } else if f.ti.unmIndir == 0 { + bm = rv.Interface().(binaryUnmarshaler) + } else { + for j, k := int8(0), f.ti.unmIndir; j < k; j++ { + if rv.IsNil() { + rv.Set(reflect.New(rv.Type().Elem())) + } + rv = rv.Elem() + } + bm = rv.Interface().(binaryUnmarshaler) + } + xbs, _ := f.dd.decodeBytes(nil) + if fnerr := bm.UnmarshalBinary(xbs); fnerr != nil { + panic(fnerr) + } +} + +func (f *decFnInfo) kErr(rv reflect.Value) { + decErr("Unhandled value for kind: %v: %s", rv.Kind(), msgBadDesc) +} + +func (f *decFnInfo) kString(rv reflect.Value) { + rv.SetString(f.dd.decodeString()) +} + +func (f *decFnInfo) kBool(rv reflect.Value) { + rv.SetBool(f.dd.decodeBool()) +} + +func (f *decFnInfo) kInt(rv reflect.Value) { + rv.SetInt(f.dd.decodeInt(intBitsize)) +} + +func (f *decFnInfo) kInt64(rv reflect.Value) { + rv.SetInt(f.dd.decodeInt(64)) +} + +func (f *decFnInfo) kInt32(rv reflect.Value) { + rv.SetInt(f.dd.decodeInt(32)) +} + +func (f *decFnInfo) kInt8(rv reflect.Value) { + rv.SetInt(f.dd.decodeInt(8)) +} + +func (f *decFnInfo) kInt16(rv reflect.Value) { + rv.SetInt(f.dd.decodeInt(16)) +} + +func (f *decFnInfo) kFloat32(rv reflect.Value) { + rv.SetFloat(f.dd.decodeFloat(true)) +} + +func (f *decFnInfo) kFloat64(rv reflect.Value) { + rv.SetFloat(f.dd.decodeFloat(false)) +} + +func (f *decFnInfo) kUint8(rv reflect.Value) { + rv.SetUint(f.dd.decodeUint(8)) +} + +func (f *decFnInfo) kUint64(rv reflect.Value) { + rv.SetUint(f.dd.decodeUint(64)) +} + +func (f *decFnInfo) kUint(rv reflect.Value) { + rv.SetUint(f.dd.decodeUint(uintBitsize)) +} + +func (f *decFnInfo) kUint32(rv reflect.Value) { + rv.SetUint(f.dd.decodeUint(32)) +} + +func (f *decFnInfo) kUint16(rv reflect.Value) { + rv.SetUint(f.dd.decodeUint(16)) +} + +// func (f *decFnInfo) kPtr(rv reflect.Value) { +// debugf(">>>>>>> ??? decode kPtr called - shouldn't get called") +// if rv.IsNil() { +// rv.Set(reflect.New(rv.Type().Elem())) +// } +// f.d.decodeValue(rv.Elem()) +// } + +func (f *decFnInfo) kInterface(rv reflect.Value) { + // debugf("\t===> kInterface") + if !rv.IsNil() { + f.d.decodeValue(rv.Elem()) + return + } + // nil interface: + // use some hieristics to set the nil interface to an + // appropriate value based on the first byte read (byte descriptor bd) + v, vt, decodeFurther := f.dd.decodeNaked() + if vt == valueTypeNil { + return + } + // Cannot decode into nil interface with methods (e.g. error, io.Reader, etc) + // if non-nil value in stream. + if num := f.ti.rt.NumMethod(); num > 0 { + decErr("decodeValue: Cannot decode non-nil codec value into nil %v (%v methods)", + f.ti.rt, num) + } + var rvn reflect.Value + var useRvn bool + switch vt { + case valueTypeMap: + if f.d.h.MapType == nil { + var m2 map[interface{}]interface{} + v = &m2 + } else { + rvn = reflect.New(f.d.h.MapType).Elem() + useRvn = true + } + case valueTypeArray: + if f.d.h.SliceType == nil { + var m2 []interface{} + v = &m2 + } else { + rvn = reflect.New(f.d.h.SliceType).Elem() + useRvn = true + } + case valueTypeExt: + re := v.(*RawExt) + var bfn func(reflect.Value, []byte) error + rvn, bfn = f.d.h.getDecodeExtForTag(re.Tag) + if bfn == nil { + rvn = reflect.ValueOf(*re) + } else if fnerr := bfn(rvn, re.Data); fnerr != nil { + panic(fnerr) + } + rv.Set(rvn) + return + } + if decodeFurther { + if useRvn { + f.d.decodeValue(rvn) + } else if v != nil { + // this v is a pointer, so we need to dereference it when done + f.d.decode(v) + rvn = reflect.ValueOf(v).Elem() + useRvn = true + } + } + if useRvn { + rv.Set(rvn) + } else if v != nil { + rv.Set(reflect.ValueOf(v)) + } +} + +func (f *decFnInfo) kStruct(rv reflect.Value) { + fti := f.ti + if currEncodedType := f.dd.currentEncodedType(); currEncodedType == valueTypeMap { + containerLen := f.dd.readMapLen() + if containerLen == 0 { + return + } + tisfi := fti.sfi + for j := 0; j < containerLen; j++ { + // var rvkencname string + // ddecode(&rvkencname) + f.dd.initReadNext() + rvkencname := f.dd.decodeString() + // rvksi := ti.getForEncName(rvkencname) + if k := fti.indexForEncName(rvkencname); k > -1 { + sfik := tisfi[k] + if sfik.i != -1 { + f.d.decodeValue(rv.Field(int(sfik.i))) + } else { + f.d.decEmbeddedField(rv, sfik.is) + } + // f.d.decodeValue(ti.field(k, rv)) + } else { + if f.d.h.ErrorIfNoField { + decErr("No matching struct field found when decoding stream map with key: %v", + rvkencname) + } else { + var nilintf0 interface{} + f.d.decodeValue(reflect.ValueOf(&nilintf0).Elem()) + } + } + } + } else if currEncodedType == valueTypeArray { + containerLen := f.dd.readArrayLen() + if containerLen == 0 { + return + } + for j, si := range fti.sfip { + if j == containerLen { + break + } + if si.i != -1 { + f.d.decodeValue(rv.Field(int(si.i))) + } else { + f.d.decEmbeddedField(rv, si.is) + } + } + if containerLen > len(fti.sfip) { + // read remaining values and throw away + for j := len(fti.sfip); j < containerLen; j++ { + var nilintf0 interface{} + f.d.decodeValue(reflect.ValueOf(&nilintf0).Elem()) + } + } + } else { + decErr("Only encoded map or array can be decoded into a struct. (valueType: %x)", + currEncodedType) + } +} + +func (f *decFnInfo) kSlice(rv reflect.Value) { + // A slice can be set from a map or array in stream. + currEncodedType := f.dd.currentEncodedType() + + switch currEncodedType { + case valueTypeBytes, valueTypeString: + if f.ti.rtid == uint8SliceTypId || f.ti.rt.Elem().Kind() == reflect.Uint8 { + if bs2, changed2 := f.dd.decodeBytes(rv.Bytes()); changed2 { + rv.SetBytes(bs2) + } + return + } + } + + if shortCircuitReflectToFastPath && rv.CanAddr() { + switch f.ti.rtid { + case intfSliceTypId: + f.d.decSliceIntf(rv.Addr().Interface().(*[]interface{}), currEncodedType, f.array) + return + case uint64SliceTypId: + f.d.decSliceUint64(rv.Addr().Interface().(*[]uint64), currEncodedType, f.array) + return + case int64SliceTypId: + f.d.decSliceInt64(rv.Addr().Interface().(*[]int64), currEncodedType, f.array) + return + case strSliceTypId: + f.d.decSliceStr(rv.Addr().Interface().(*[]string), currEncodedType, f.array) + return + } + } + + containerLen, containerLenS := decContLens(f.dd, currEncodedType) + + // an array can never return a nil slice. so no need to check f.array here. + + if rv.IsNil() { + rv.Set(reflect.MakeSlice(f.ti.rt, containerLenS, containerLenS)) + } + + if containerLen == 0 { + return + } + + if rvcap, rvlen := rv.Len(), rv.Cap(); containerLenS > rvcap { + if f.array { // !rv.CanSet() + decErr(msgDecCannotExpandArr, rvcap, containerLenS) + } + rvn := reflect.MakeSlice(f.ti.rt, containerLenS, containerLenS) + if rvlen > 0 { + reflect.Copy(rvn, rv) + } + rv.Set(rvn) + } else if containerLenS > rvlen { + rv.SetLen(containerLenS) + } + + for j := 0; j < containerLenS; j++ { + f.d.decodeValue(rv.Index(j)) + } +} + +func (f *decFnInfo) kArray(rv reflect.Value) { + // f.d.decodeValue(rv.Slice(0, rv.Len())) + f.kSlice(rv.Slice(0, rv.Len())) +} + +func (f *decFnInfo) kMap(rv reflect.Value) { + if shortCircuitReflectToFastPath && rv.CanAddr() { + switch f.ti.rtid { + case mapStrIntfTypId: + f.d.decMapStrIntf(rv.Addr().Interface().(*map[string]interface{})) + return + case mapIntfIntfTypId: + f.d.decMapIntfIntf(rv.Addr().Interface().(*map[interface{}]interface{})) + return + case mapInt64IntfTypId: + f.d.decMapInt64Intf(rv.Addr().Interface().(*map[int64]interface{})) + return + case mapUint64IntfTypId: + f.d.decMapUint64Intf(rv.Addr().Interface().(*map[uint64]interface{})) + return + } + } + + containerLen := f.dd.readMapLen() + + if rv.IsNil() { + rv.Set(reflect.MakeMap(f.ti.rt)) + } + + if containerLen == 0 { + return + } + + ktype, vtype := f.ti.rt.Key(), f.ti.rt.Elem() + ktypeId := reflect.ValueOf(ktype).Pointer() + for j := 0; j < containerLen; j++ { + rvk := reflect.New(ktype).Elem() + f.d.decodeValue(rvk) + + // special case if a byte array. + // if ktype == intfTyp { + if ktypeId == intfTypId { + rvk = rvk.Elem() + if rvk.Type() == uint8SliceTyp { + rvk = reflect.ValueOf(string(rvk.Bytes())) + } + } + rvv := rv.MapIndex(rvk) + if !rvv.IsValid() { + rvv = reflect.New(vtype).Elem() + } + + f.d.decodeValue(rvv) + rv.SetMapIndex(rvk, rvv) + } +} + +// ---------------------------------------- + +type decFn struct { + i *decFnInfo + f func(*decFnInfo, reflect.Value) +} + +// A Decoder reads and decodes an object from an input stream in the codec format. +type Decoder struct { + r decReader + d decDriver + h *BasicHandle + f map[uintptr]decFn + x []uintptr + s []decFn +} + +// NewDecoder returns a Decoder for decoding a stream of bytes from an io.Reader. +// +// For efficiency, Users are encouraged to pass in a memory buffered writer +// (eg bufio.Reader, bytes.Buffer). +func NewDecoder(r io.Reader, h Handle) *Decoder { + z := ioDecReader{ + r: r, + } + z.br, _ = r.(io.ByteReader) + return &Decoder{r: &z, d: h.newDecDriver(&z), h: h.getBasicHandle()} +} + +// NewDecoderBytes returns a Decoder which efficiently decodes directly +// from a byte slice with zero copying. +func NewDecoderBytes(in []byte, h Handle) *Decoder { + z := bytesDecReader{ + b: in, + a: len(in), + } + return &Decoder{r: &z, d: h.newDecDriver(&z), h: h.getBasicHandle()} +} + +// Decode decodes the stream from reader and stores the result in the +// value pointed to by v. v cannot be a nil pointer. v can also be +// a reflect.Value of a pointer. +// +// Note that a pointer to a nil interface is not a nil pointer. +// If you do not know what type of stream it is, pass in a pointer to a nil interface. +// We will decode and store a value in that nil interface. +// +// Sample usages: +// // Decoding into a non-nil typed value +// var f float32 +// err = codec.NewDecoder(r, handle).Decode(&f) +// +// // Decoding into nil interface +// var v interface{} +// dec := codec.NewDecoder(r, handle) +// err = dec.Decode(&v) +// +// When decoding into a nil interface{}, we will decode into an appropriate value based +// on the contents of the stream: +// - Numbers are decoded as float64, int64 or uint64. +// - Other values are decoded appropriately depending on the type: +// bool, string, []byte, time.Time, etc +// - Extensions are decoded as RawExt (if no ext function registered for the tag) +// Configurations exist on the Handle to override defaults +// (e.g. for MapType, SliceType and how to decode raw bytes). +// +// When decoding into a non-nil interface{} value, the mode of encoding is based on the +// type of the value. When a value is seen: +// - If an extension is registered for it, call that extension function +// - If it implements BinaryUnmarshaler, call its UnmarshalBinary(data []byte) error +// - Else decode it based on its reflect.Kind +// +// There are some special rules when decoding into containers (slice/array/map/struct). +// Decode will typically use the stream contents to UPDATE the container. +// - A map can be decoded from a stream map, by updating matching keys. +// - A slice can be decoded from a stream array, +// by updating the first n elements, where n is length of the stream. +// - A slice can be decoded from a stream map, by decoding as if +// it contains a sequence of key-value pairs. +// - A struct can be decoded from a stream map, by updating matching fields. +// - A struct can be decoded from a stream array, +// by updating fields as they occur in the struct (by index). +// +// When decoding a stream map or array with length of 0 into a nil map or slice, +// we reset the destination map or slice to a zero-length value. +// +// However, when decoding a stream nil, we reset the destination container +// to its "zero" value (e.g. nil for slice/map, etc). +// +func (d *Decoder) Decode(v interface{}) (err error) { + defer panicToErr(&err) + d.decode(v) + return +} + +func (d *Decoder) decode(iv interface{}) { + d.d.initReadNext() + + switch v := iv.(type) { + case nil: + decErr("Cannot decode into nil.") + + case reflect.Value: + d.chkPtrValue(v) + d.decodeValue(v.Elem()) + + case *string: + *v = d.d.decodeString() + case *bool: + *v = d.d.decodeBool() + case *int: + *v = int(d.d.decodeInt(intBitsize)) + case *int8: + *v = int8(d.d.decodeInt(8)) + case *int16: + *v = int16(d.d.decodeInt(16)) + case *int32: + *v = int32(d.d.decodeInt(32)) + case *int64: + *v = d.d.decodeInt(64) + case *uint: + *v = uint(d.d.decodeUint(uintBitsize)) + case *uint8: + *v = uint8(d.d.decodeUint(8)) + case *uint16: + *v = uint16(d.d.decodeUint(16)) + case *uint32: + *v = uint32(d.d.decodeUint(32)) + case *uint64: + *v = d.d.decodeUint(64) + case *float32: + *v = float32(d.d.decodeFloat(true)) + case *float64: + *v = d.d.decodeFloat(false) + case *[]byte: + *v, _ = d.d.decodeBytes(*v) + + case *[]interface{}: + d.decSliceIntf(v, valueTypeInvalid, false) + case *[]uint64: + d.decSliceUint64(v, valueTypeInvalid, false) + case *[]int64: + d.decSliceInt64(v, valueTypeInvalid, false) + case *[]string: + d.decSliceStr(v, valueTypeInvalid, false) + case *map[string]interface{}: + d.decMapStrIntf(v) + case *map[interface{}]interface{}: + d.decMapIntfIntf(v) + case *map[uint64]interface{}: + d.decMapUint64Intf(v) + case *map[int64]interface{}: + d.decMapInt64Intf(v) + + case *interface{}: + d.decodeValue(reflect.ValueOf(iv).Elem()) + + default: + rv := reflect.ValueOf(iv) + d.chkPtrValue(rv) + d.decodeValue(rv.Elem()) + } +} + +func (d *Decoder) decodeValue(rv reflect.Value) { + d.d.initReadNext() + + if d.d.tryDecodeAsNil() { + // If value in stream is nil, set the dereferenced value to its "zero" value (if settable) + if rv.Kind() == reflect.Ptr { + if !rv.IsNil() { + rv.Set(reflect.Zero(rv.Type())) + } + return + } + // for rv.Kind() == reflect.Ptr { + // rv = rv.Elem() + // } + if rv.IsValid() { // rv.CanSet() // always settable, except it's invalid + rv.Set(reflect.Zero(rv.Type())) + } + return + } + + // If stream is not containing a nil value, then we can deref to the base + // non-pointer value, and decode into that. + for rv.Kind() == reflect.Ptr { + if rv.IsNil() { + rv.Set(reflect.New(rv.Type().Elem())) + } + rv = rv.Elem() + } + + rt := rv.Type() + rtid := reflect.ValueOf(rt).Pointer() + + // retrieve or register a focus'ed function for this type + // to eliminate need to do the retrieval multiple times + + // if d.f == nil && d.s == nil { debugf("---->Creating new dec f map for type: %v\n", rt) } + var fn decFn + var ok bool + if useMapForCodecCache { + fn, ok = d.f[rtid] + } else { + for i, v := range d.x { + if v == rtid { + fn, ok = d.s[i], true + break + } + } + } + if !ok { + // debugf("\tCreating new dec fn for type: %v\n", rt) + fi := decFnInfo{ti: getTypeInfo(rtid, rt), d: d, dd: d.d} + fn.i = &fi + // An extension can be registered for any type, regardless of the Kind + // (e.g. type BitSet int64, type MyStruct { / * unexported fields * / }, type X []int, etc. + // + // We can't check if it's an extension byte here first, because the user may have + // registered a pointer or non-pointer type, meaning we may have to recurse first + // before matching a mapped type, even though the extension byte is already detected. + // + // NOTE: if decoding into a nil interface{}, we return a non-nil + // value except even if the container registers a length of 0. + if rtid == rawExtTypId { + fn.f = (*decFnInfo).rawExt + } else if d.d.isBuiltinType(rtid) { + fn.f = (*decFnInfo).builtin + } else if xfTag, xfFn := d.h.getDecodeExt(rtid); xfFn != nil { + fi.xfTag, fi.xfFn = xfTag, xfFn + fn.f = (*decFnInfo).ext + } else if supportBinaryMarshal && fi.ti.unm { + fn.f = (*decFnInfo).binaryMarshal + } else { + switch rk := rt.Kind(); rk { + case reflect.String: + fn.f = (*decFnInfo).kString + case reflect.Bool: + fn.f = (*decFnInfo).kBool + case reflect.Int: + fn.f = (*decFnInfo).kInt + case reflect.Int64: + fn.f = (*decFnInfo).kInt64 + case reflect.Int32: + fn.f = (*decFnInfo).kInt32 + case reflect.Int8: + fn.f = (*decFnInfo).kInt8 + case reflect.Int16: + fn.f = (*decFnInfo).kInt16 + case reflect.Float32: + fn.f = (*decFnInfo).kFloat32 + case reflect.Float64: + fn.f = (*decFnInfo).kFloat64 + case reflect.Uint8: + fn.f = (*decFnInfo).kUint8 + case reflect.Uint64: + fn.f = (*decFnInfo).kUint64 + case reflect.Uint: + fn.f = (*decFnInfo).kUint + case reflect.Uint32: + fn.f = (*decFnInfo).kUint32 + case reflect.Uint16: + fn.f = (*decFnInfo).kUint16 + // case reflect.Ptr: + // fn.f = (*decFnInfo).kPtr + case reflect.Interface: + fn.f = (*decFnInfo).kInterface + case reflect.Struct: + fn.f = (*decFnInfo).kStruct + case reflect.Slice: + fn.f = (*decFnInfo).kSlice + case reflect.Array: + fi.array = true + fn.f = (*decFnInfo).kArray + case reflect.Map: + fn.f = (*decFnInfo).kMap + default: + fn.f = (*decFnInfo).kErr + } + } + if useMapForCodecCache { + if d.f == nil { + d.f = make(map[uintptr]decFn, 16) + } + d.f[rtid] = fn + } else { + d.s = append(d.s, fn) + d.x = append(d.x, rtid) + } + } + + fn.f(fn.i, rv) + + return +} + +func (d *Decoder) chkPtrValue(rv reflect.Value) { + // We can only decode into a non-nil pointer + if rv.Kind() == reflect.Ptr && !rv.IsNil() { + return + } + if !rv.IsValid() { + decErr("Cannot decode into a zero (ie invalid) reflect.Value") + } + if !rv.CanInterface() { + decErr("Cannot decode into a value without an interface: %v", rv) + } + rvi := rv.Interface() + decErr("Cannot decode into non-pointer or nil pointer. Got: %v, %T, %v", + rv.Kind(), rvi, rvi) +} + +func (d *Decoder) decEmbeddedField(rv reflect.Value, index []int) { + // d.decodeValue(rv.FieldByIndex(index)) + // nil pointers may be here; so reproduce FieldByIndex logic + enhancements + for _, j := range index { + if rv.Kind() == reflect.Ptr { + if rv.IsNil() { + rv.Set(reflect.New(rv.Type().Elem())) + } + // If a pointer, it must be a pointer to struct (based on typeInfo contract) + rv = rv.Elem() + } + rv = rv.Field(j) + } + d.decodeValue(rv) +} + +// -------------------------------------------------- + +// short circuit functions for common maps and slices + +func (d *Decoder) decSliceIntf(v *[]interface{}, currEncodedType valueType, doNotReset bool) { + _, containerLenS := decContLens(d.d, currEncodedType) + s := *v + if s == nil { + s = make([]interface{}, containerLenS, containerLenS) + } else if containerLenS > cap(s) { + if doNotReset { + decErr(msgDecCannotExpandArr, cap(s), containerLenS) + } + s = make([]interface{}, containerLenS, containerLenS) + copy(s, *v) + } else if containerLenS > len(s) { + s = s[:containerLenS] + } + for j := 0; j < containerLenS; j++ { + d.decode(&s[j]) + } + *v = s +} + +func (d *Decoder) decSliceInt64(v *[]int64, currEncodedType valueType, doNotReset bool) { + _, containerLenS := decContLens(d.d, currEncodedType) + s := *v + if s == nil { + s = make([]int64, containerLenS, containerLenS) + } else if containerLenS > cap(s) { + if doNotReset { + decErr(msgDecCannotExpandArr, cap(s), containerLenS) + } + s = make([]int64, containerLenS, containerLenS) + copy(s, *v) + } else if containerLenS > len(s) { + s = s[:containerLenS] + } + for j := 0; j < containerLenS; j++ { + // d.decode(&s[j]) + d.d.initReadNext() + s[j] = d.d.decodeInt(intBitsize) + } + *v = s +} + +func (d *Decoder) decSliceUint64(v *[]uint64, currEncodedType valueType, doNotReset bool) { + _, containerLenS := decContLens(d.d, currEncodedType) + s := *v + if s == nil { + s = make([]uint64, containerLenS, containerLenS) + } else if containerLenS > cap(s) { + if doNotReset { + decErr(msgDecCannotExpandArr, cap(s), containerLenS) + } + s = make([]uint64, containerLenS, containerLenS) + copy(s, *v) + } else if containerLenS > len(s) { + s = s[:containerLenS] + } + for j := 0; j < containerLenS; j++ { + // d.decode(&s[j]) + d.d.initReadNext() + s[j] = d.d.decodeUint(intBitsize) + } + *v = s +} + +func (d *Decoder) decSliceStr(v *[]string, currEncodedType valueType, doNotReset bool) { + _, containerLenS := decContLens(d.d, currEncodedType) + s := *v + if s == nil { + s = make([]string, containerLenS, containerLenS) + } else if containerLenS > cap(s) { + if doNotReset { + decErr(msgDecCannotExpandArr, cap(s), containerLenS) + } + s = make([]string, containerLenS, containerLenS) + copy(s, *v) + } else if containerLenS > len(s) { + s = s[:containerLenS] + } + for j := 0; j < containerLenS; j++ { + // d.decode(&s[j]) + d.d.initReadNext() + s[j] = d.d.decodeString() + } + *v = s +} + +func (d *Decoder) decMapIntfIntf(v *map[interface{}]interface{}) { + containerLen := d.d.readMapLen() + m := *v + if m == nil { + m = make(map[interface{}]interface{}, containerLen) + *v = m + } + for j := 0; j < containerLen; j++ { + var mk interface{} + d.decode(&mk) + // special case if a byte array. + if bv, bok := mk.([]byte); bok { + mk = string(bv) + } + mv := m[mk] + d.decode(&mv) + m[mk] = mv + } +} + +func (d *Decoder) decMapInt64Intf(v *map[int64]interface{}) { + containerLen := d.d.readMapLen() + m := *v + if m == nil { + m = make(map[int64]interface{}, containerLen) + *v = m + } + for j := 0; j < containerLen; j++ { + d.d.initReadNext() + mk := d.d.decodeInt(intBitsize) + mv := m[mk] + d.decode(&mv) + m[mk] = mv + } +} + +func (d *Decoder) decMapUint64Intf(v *map[uint64]interface{}) { + containerLen := d.d.readMapLen() + m := *v + if m == nil { + m = make(map[uint64]interface{}, containerLen) + *v = m + } + for j := 0; j < containerLen; j++ { + d.d.initReadNext() + mk := d.d.decodeUint(intBitsize) + mv := m[mk] + d.decode(&mv) + m[mk] = mv + } +} + +func (d *Decoder) decMapStrIntf(v *map[string]interface{}) { + containerLen := d.d.readMapLen() + m := *v + if m == nil { + m = make(map[string]interface{}, containerLen) + *v = m + } + for j := 0; j < containerLen; j++ { + d.d.initReadNext() + mk := d.d.decodeString() + mv := m[mk] + d.decode(&mv) + m[mk] = mv + } +} + +// ---------------------------------------- + +func decContLens(dd decDriver, currEncodedType valueType) (containerLen, containerLenS int) { + if currEncodedType == valueTypeInvalid { + currEncodedType = dd.currentEncodedType() + } + switch currEncodedType { + case valueTypeArray: + containerLen = dd.readArrayLen() + containerLenS = containerLen + case valueTypeMap: + containerLen = dd.readMapLen() + containerLenS = containerLen * 2 + default: + decErr("Only encoded map or array can be decoded into a slice. (valueType: %0x)", + currEncodedType) + } + return +} + +func decErr(format string, params ...interface{}) { + doPanic(msgTagDec, format, params...) +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/encode.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/encode.go new file mode 100644 index 00000000000..4914be0c748 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/encode.go @@ -0,0 +1,1001 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +package codec + +import ( + "io" + "reflect" +) + +const ( + // Some tagging information for error messages. + msgTagEnc = "codec.encoder" + defEncByteBufSize = 1 << 6 // 4:16, 6:64, 8:256, 10:1024 + // maxTimeSecs32 = math.MaxInt32 / 60 / 24 / 366 +) + +// AsSymbolFlag defines what should be encoded as symbols. +type AsSymbolFlag uint8 + +const ( + // AsSymbolDefault is default. + // Currently, this means only encode struct field names as symbols. + // The default is subject to change. + AsSymbolDefault AsSymbolFlag = iota + + // AsSymbolAll means encode anything which could be a symbol as a symbol. + AsSymbolAll = 0xfe + + // AsSymbolNone means do not encode anything as a symbol. + AsSymbolNone = 1 << iota + + // AsSymbolMapStringKeys means encode keys in map[string]XXX as symbols. + AsSymbolMapStringKeysFlag + + // AsSymbolStructFieldName means encode struct field names as symbols. + AsSymbolStructFieldNameFlag +) + +// encWriter abstracting writing to a byte array or to an io.Writer. +type encWriter interface { + writeUint16(uint16) + writeUint32(uint32) + writeUint64(uint64) + writeb([]byte) + writestr(string) + writen1(byte) + writen2(byte, byte) + atEndOfEncode() +} + +// encDriver abstracts the actual codec (binc vs msgpack, etc) +type encDriver interface { + isBuiltinType(rt uintptr) bool + encodeBuiltin(rt uintptr, v interface{}) + encodeNil() + encodeInt(i int64) + encodeUint(i uint64) + encodeBool(b bool) + encodeFloat32(f float32) + encodeFloat64(f float64) + encodeExtPreamble(xtag byte, length int) + encodeArrayPreamble(length int) + encodeMapPreamble(length int) + encodeString(c charEncoding, v string) + encodeSymbol(v string) + encodeStringBytes(c charEncoding, v []byte) + //TODO + //encBignum(f *big.Int) + //encStringRunes(c charEncoding, v []rune) +} + +type ioEncWriterWriter interface { + WriteByte(c byte) error + WriteString(s string) (n int, err error) + Write(p []byte) (n int, err error) +} + +type ioEncStringWriter interface { + WriteString(s string) (n int, err error) +} + +type EncodeOptions struct { + // Encode a struct as an array, and not as a map. + StructToArray bool + + // AsSymbols defines what should be encoded as symbols. + // + // Encoding as symbols can reduce the encoded size significantly. + // + // However, during decoding, each string to be encoded as a symbol must + // be checked to see if it has been seen before. Consequently, encoding time + // will increase if using symbols, because string comparisons has a clear cost. + // + // Sample values: + // AsSymbolNone + // AsSymbolAll + // AsSymbolMapStringKeys + // AsSymbolMapStringKeysFlag | AsSymbolStructFieldNameFlag + AsSymbols AsSymbolFlag +} + +// --------------------------------------------- + +type simpleIoEncWriterWriter struct { + w io.Writer + bw io.ByteWriter + sw ioEncStringWriter +} + +func (o *simpleIoEncWriterWriter) WriteByte(c byte) (err error) { + if o.bw != nil { + return o.bw.WriteByte(c) + } + _, err = o.w.Write([]byte{c}) + return +} + +func (o *simpleIoEncWriterWriter) WriteString(s string) (n int, err error) { + if o.sw != nil { + return o.sw.WriteString(s) + } + return o.w.Write([]byte(s)) +} + +func (o *simpleIoEncWriterWriter) Write(p []byte) (n int, err error) { + return o.w.Write(p) +} + +// ---------------------------------------- + +// ioEncWriter implements encWriter and can write to an io.Writer implementation +type ioEncWriter struct { + w ioEncWriterWriter + x [8]byte // temp byte array re-used internally for efficiency +} + +func (z *ioEncWriter) writeUint16(v uint16) { + bigen.PutUint16(z.x[:2], v) + z.writeb(z.x[:2]) +} + +func (z *ioEncWriter) writeUint32(v uint32) { + bigen.PutUint32(z.x[:4], v) + z.writeb(z.x[:4]) +} + +func (z *ioEncWriter) writeUint64(v uint64) { + bigen.PutUint64(z.x[:8], v) + z.writeb(z.x[:8]) +} + +func (z *ioEncWriter) writeb(bs []byte) { + if len(bs) == 0 { + return + } + n, err := z.w.Write(bs) + if err != nil { + panic(err) + } + if n != len(bs) { + encErr("write: Incorrect num bytes written. Expecting: %v, Wrote: %v", len(bs), n) + } +} + +func (z *ioEncWriter) writestr(s string) { + n, err := z.w.WriteString(s) + if err != nil { + panic(err) + } + if n != len(s) { + encErr("write: Incorrect num bytes written. Expecting: %v, Wrote: %v", len(s), n) + } +} + +func (z *ioEncWriter) writen1(b byte) { + if err := z.w.WriteByte(b); err != nil { + panic(err) + } +} + +func (z *ioEncWriter) writen2(b1 byte, b2 byte) { + z.writen1(b1) + z.writen1(b2) +} + +func (z *ioEncWriter) atEndOfEncode() {} + +// ---------------------------------------- + +// bytesEncWriter implements encWriter and can write to an byte slice. +// It is used by Marshal function. +type bytesEncWriter struct { + b []byte + c int // cursor + out *[]byte // write out on atEndOfEncode +} + +func (z *bytesEncWriter) writeUint16(v uint16) { + c := z.grow(2) + z.b[c] = byte(v >> 8) + z.b[c+1] = byte(v) +} + +func (z *bytesEncWriter) writeUint32(v uint32) { + c := z.grow(4) + z.b[c] = byte(v >> 24) + z.b[c+1] = byte(v >> 16) + z.b[c+2] = byte(v >> 8) + z.b[c+3] = byte(v) +} + +func (z *bytesEncWriter) writeUint64(v uint64) { + c := z.grow(8) + z.b[c] = byte(v >> 56) + z.b[c+1] = byte(v >> 48) + z.b[c+2] = byte(v >> 40) + z.b[c+3] = byte(v >> 32) + z.b[c+4] = byte(v >> 24) + z.b[c+5] = byte(v >> 16) + z.b[c+6] = byte(v >> 8) + z.b[c+7] = byte(v) +} + +func (z *bytesEncWriter) writeb(s []byte) { + if len(s) == 0 { + return + } + c := z.grow(len(s)) + copy(z.b[c:], s) +} + +func (z *bytesEncWriter) writestr(s string) { + c := z.grow(len(s)) + copy(z.b[c:], s) +} + +func (z *bytesEncWriter) writen1(b1 byte) { + c := z.grow(1) + z.b[c] = b1 +} + +func (z *bytesEncWriter) writen2(b1 byte, b2 byte) { + c := z.grow(2) + z.b[c] = b1 + z.b[c+1] = b2 +} + +func (z *bytesEncWriter) atEndOfEncode() { + *(z.out) = z.b[:z.c] +} + +func (z *bytesEncWriter) grow(n int) (oldcursor int) { + oldcursor = z.c + z.c = oldcursor + n + if z.c > cap(z.b) { + // Tried using appendslice logic: (if cap < 1024, *2, else *1.25). + // However, it was too expensive, causing too many iterations of copy. + // Using bytes.Buffer model was much better (2*cap + n) + bs := make([]byte, 2*cap(z.b)+n) + copy(bs, z.b[:oldcursor]) + z.b = bs + } else if z.c > len(z.b) { + z.b = z.b[:cap(z.b)] + } + return +} + +// --------------------------------------------- + +type encFnInfo struct { + ti *typeInfo + e *Encoder + ee encDriver + xfFn func(reflect.Value) ([]byte, error) + xfTag byte +} + +func (f *encFnInfo) builtin(rv reflect.Value) { + f.ee.encodeBuiltin(f.ti.rtid, rv.Interface()) +} + +func (f *encFnInfo) rawExt(rv reflect.Value) { + f.e.encRawExt(rv.Interface().(RawExt)) +} + +func (f *encFnInfo) ext(rv reflect.Value) { + bs, fnerr := f.xfFn(rv) + if fnerr != nil { + panic(fnerr) + } + if bs == nil { + f.ee.encodeNil() + return + } + if f.e.hh.writeExt() { + f.ee.encodeExtPreamble(f.xfTag, len(bs)) + f.e.w.writeb(bs) + } else { + f.ee.encodeStringBytes(c_RAW, bs) + } + +} + +func (f *encFnInfo) binaryMarshal(rv reflect.Value) { + var bm binaryMarshaler + if f.ti.mIndir == 0 { + bm = rv.Interface().(binaryMarshaler) + } else if f.ti.mIndir == -1 { + bm = rv.Addr().Interface().(binaryMarshaler) + } else { + for j, k := int8(0), f.ti.mIndir; j < k; j++ { + if rv.IsNil() { + f.ee.encodeNil() + return + } + rv = rv.Elem() + } + bm = rv.Interface().(binaryMarshaler) + } + // debugf(">>>> binaryMarshaler: %T", rv.Interface()) + bs, fnerr := bm.MarshalBinary() + if fnerr != nil { + panic(fnerr) + } + if bs == nil { + f.ee.encodeNil() + } else { + f.ee.encodeStringBytes(c_RAW, bs) + } +} + +func (f *encFnInfo) kBool(rv reflect.Value) { + f.ee.encodeBool(rv.Bool()) +} + +func (f *encFnInfo) kString(rv reflect.Value) { + f.ee.encodeString(c_UTF8, rv.String()) +} + +func (f *encFnInfo) kFloat64(rv reflect.Value) { + f.ee.encodeFloat64(rv.Float()) +} + +func (f *encFnInfo) kFloat32(rv reflect.Value) { + f.ee.encodeFloat32(float32(rv.Float())) +} + +func (f *encFnInfo) kInt(rv reflect.Value) { + f.ee.encodeInt(rv.Int()) +} + +func (f *encFnInfo) kUint(rv reflect.Value) { + f.ee.encodeUint(rv.Uint()) +} + +func (f *encFnInfo) kInvalid(rv reflect.Value) { + f.ee.encodeNil() +} + +func (f *encFnInfo) kErr(rv reflect.Value) { + encErr("Unsupported kind: %s, for: %#v", rv.Kind(), rv) +} + +func (f *encFnInfo) kSlice(rv reflect.Value) { + if rv.IsNil() { + f.ee.encodeNil() + return + } + + if shortCircuitReflectToFastPath { + switch f.ti.rtid { + case intfSliceTypId: + f.e.encSliceIntf(rv.Interface().([]interface{})) + return + case strSliceTypId: + f.e.encSliceStr(rv.Interface().([]string)) + return + case uint64SliceTypId: + f.e.encSliceUint64(rv.Interface().([]uint64)) + return + case int64SliceTypId: + f.e.encSliceInt64(rv.Interface().([]int64)) + return + } + } + + // If in this method, then there was no extension function defined. + // So it's okay to treat as []byte. + if f.ti.rtid == uint8SliceTypId || f.ti.rt.Elem().Kind() == reflect.Uint8 { + f.ee.encodeStringBytes(c_RAW, rv.Bytes()) + return + } + + l := rv.Len() + if f.ti.mbs { + if l%2 == 1 { + encErr("mapBySlice: invalid length (must be divisible by 2): %v", l) + } + f.ee.encodeMapPreamble(l / 2) + } else { + f.ee.encodeArrayPreamble(l) + } + if l == 0 { + return + } + for j := 0; j < l; j++ { + // TODO: Consider perf implication of encoding odd index values as symbols if type is string + f.e.encodeValue(rv.Index(j)) + } +} + +func (f *encFnInfo) kArray(rv reflect.Value) { + // We cannot share kSlice method, because the array may be non-addressable. + // E.g. type struct S{B [2]byte}; Encode(S{}) will bomb on "panic: slice of unaddressable array". + // So we have to duplicate the functionality here. + // f.e.encodeValue(rv.Slice(0, rv.Len())) + // f.kSlice(rv.Slice(0, rv.Len())) + + l := rv.Len() + // Handle an array of bytes specially (in line with what is done for slices) + if f.ti.rt.Elem().Kind() == reflect.Uint8 { + if l == 0 { + f.ee.encodeStringBytes(c_RAW, nil) + return + } + var bs []byte + if rv.CanAddr() { + bs = rv.Slice(0, l).Bytes() + } else { + bs = make([]byte, l) + for i := 0; i < l; i++ { + bs[i] = byte(rv.Index(i).Uint()) + } + } + f.ee.encodeStringBytes(c_RAW, bs) + return + } + + if f.ti.mbs { + if l%2 == 1 { + encErr("mapBySlice: invalid length (must be divisible by 2): %v", l) + } + f.ee.encodeMapPreamble(l / 2) + } else { + f.ee.encodeArrayPreamble(l) + } + if l == 0 { + return + } + for j := 0; j < l; j++ { + // TODO: Consider perf implication of encoding odd index values as symbols if type is string + f.e.encodeValue(rv.Index(j)) + } +} + +func (f *encFnInfo) kStruct(rv reflect.Value) { + fti := f.ti + newlen := len(fti.sfi) + rvals := make([]reflect.Value, newlen) + var encnames []string + e := f.e + tisfi := fti.sfip + toMap := !(fti.toArray || e.h.StructToArray) + // if toMap, use the sorted array. If toArray, use unsorted array (to match sequence in struct) + if toMap { + tisfi = fti.sfi + encnames = make([]string, newlen) + } + newlen = 0 + for _, si := range tisfi { + if si.i != -1 { + rvals[newlen] = rv.Field(int(si.i)) + } else { + rvals[newlen] = rv.FieldByIndex(si.is) + } + if toMap { + if si.omitEmpty && isEmptyValue(rvals[newlen]) { + continue + } + encnames[newlen] = si.encName + } else { + if si.omitEmpty && isEmptyValue(rvals[newlen]) { + rvals[newlen] = reflect.Value{} //encode as nil + } + } + newlen++ + } + + // debugf(">>>> kStruct: newlen: %v", newlen) + if toMap { + ee := f.ee //don't dereference everytime + ee.encodeMapPreamble(newlen) + // asSymbols := e.h.AsSymbols&AsSymbolStructFieldNameFlag != 0 + asSymbols := e.h.AsSymbols == AsSymbolDefault || e.h.AsSymbols&AsSymbolStructFieldNameFlag != 0 + for j := 0; j < newlen; j++ { + if asSymbols { + ee.encodeSymbol(encnames[j]) + } else { + ee.encodeString(c_UTF8, encnames[j]) + } + e.encodeValue(rvals[j]) + } + } else { + f.ee.encodeArrayPreamble(newlen) + for j := 0; j < newlen; j++ { + e.encodeValue(rvals[j]) + } + } +} + +// func (f *encFnInfo) kPtr(rv reflect.Value) { +// debugf(">>>>>>> ??? encode kPtr called - shouldn't get called") +// if rv.IsNil() { +// f.ee.encodeNil() +// return +// } +// f.e.encodeValue(rv.Elem()) +// } + +func (f *encFnInfo) kInterface(rv reflect.Value) { + if rv.IsNil() { + f.ee.encodeNil() + return + } + f.e.encodeValue(rv.Elem()) +} + +func (f *encFnInfo) kMap(rv reflect.Value) { + if rv.IsNil() { + f.ee.encodeNil() + return + } + + if shortCircuitReflectToFastPath { + switch f.ti.rtid { + case mapIntfIntfTypId: + f.e.encMapIntfIntf(rv.Interface().(map[interface{}]interface{})) + return + case mapStrIntfTypId: + f.e.encMapStrIntf(rv.Interface().(map[string]interface{})) + return + case mapStrStrTypId: + f.e.encMapStrStr(rv.Interface().(map[string]string)) + return + case mapInt64IntfTypId: + f.e.encMapInt64Intf(rv.Interface().(map[int64]interface{})) + return + case mapUint64IntfTypId: + f.e.encMapUint64Intf(rv.Interface().(map[uint64]interface{})) + return + } + } + + l := rv.Len() + f.ee.encodeMapPreamble(l) + if l == 0 { + return + } + // keyTypeIsString := f.ti.rt.Key().Kind() == reflect.String + keyTypeIsString := f.ti.rt.Key() == stringTyp + var asSymbols bool + if keyTypeIsString { + asSymbols = f.e.h.AsSymbols&AsSymbolMapStringKeysFlag != 0 + } + mks := rv.MapKeys() + // for j, lmks := 0, len(mks); j < lmks; j++ { + for j := range mks { + if keyTypeIsString { + if asSymbols { + f.ee.encodeSymbol(mks[j].String()) + } else { + f.ee.encodeString(c_UTF8, mks[j].String()) + } + } else { + f.e.encodeValue(mks[j]) + } + f.e.encodeValue(rv.MapIndex(mks[j])) + } + +} + +// -------------------------------------------------- + +// encFn encapsulates the captured variables and the encode function. +// This way, we only do some calculations one times, and pass to the +// code block that should be called (encapsulated in a function) +// instead of executing the checks every time. +type encFn struct { + i *encFnInfo + f func(*encFnInfo, reflect.Value) +} + +// -------------------------------------------------- + +// An Encoder writes an object to an output stream in the codec format. +type Encoder struct { + w encWriter + e encDriver + h *BasicHandle + hh Handle + f map[uintptr]encFn + x []uintptr + s []encFn +} + +// NewEncoder returns an Encoder for encoding into an io.Writer. +// +// For efficiency, Users are encouraged to pass in a memory buffered writer +// (eg bufio.Writer, bytes.Buffer). +func NewEncoder(w io.Writer, h Handle) *Encoder { + ww, ok := w.(ioEncWriterWriter) + if !ok { + sww := simpleIoEncWriterWriter{w: w} + sww.bw, _ = w.(io.ByteWriter) + sww.sw, _ = w.(ioEncStringWriter) + ww = &sww + //ww = bufio.NewWriterSize(w, defEncByteBufSize) + } + z := ioEncWriter{ + w: ww, + } + return &Encoder{w: &z, hh: h, h: h.getBasicHandle(), e: h.newEncDriver(&z)} +} + +// NewEncoderBytes returns an encoder for encoding directly and efficiently +// into a byte slice, using zero-copying to temporary slices. +// +// It will potentially replace the output byte slice pointed to. +// After encoding, the out parameter contains the encoded contents. +func NewEncoderBytes(out *[]byte, h Handle) *Encoder { + in := *out + if in == nil { + in = make([]byte, defEncByteBufSize) + } + z := bytesEncWriter{ + b: in, + out: out, + } + return &Encoder{w: &z, hh: h, h: h.getBasicHandle(), e: h.newEncDriver(&z)} +} + +// Encode writes an object into a stream in the codec format. +// +// Encoding can be configured via the "codec" struct tag for the fields. +// +// The "codec" key in struct field's tag value is the key name, +// followed by an optional comma and options. +// +// To set an option on all fields (e.g. omitempty on all fields), you +// can create a field called _struct, and set flags on it. +// +// Struct values "usually" encode as maps. Each exported struct field is encoded unless: +// - the field's codec tag is "-", OR +// - the field is empty and its codec tag specifies the "omitempty" option. +// +// When encoding as a map, the first string in the tag (before the comma) +// is the map key string to use when encoding. +// +// However, struct values may encode as arrays. This happens when: +// - StructToArray Encode option is set, OR +// - the codec tag on the _struct field sets the "toarray" option +// +// Values with types that implement MapBySlice are encoded as stream maps. +// +// The empty values (for omitempty option) are false, 0, any nil pointer +// or interface value, and any array, slice, map, or string of length zero. +// +// Anonymous fields are encoded inline if no struct tag is present. +// Else they are encoded as regular fields. +// +// Examples: +// +// type MyStruct struct { +// _struct bool `codec:",omitempty"` //set omitempty for every field +// Field1 string `codec:"-"` //skip this field +// Field2 int `codec:"myName"` //Use key "myName" in encode stream +// Field3 int32 `codec:",omitempty"` //use key "Field3". Omit if empty. +// Field4 bool `codec:"f4,omitempty"` //use key "f4". Omit if empty. +// ... +// } +// +// type MyStruct struct { +// _struct bool `codec:",omitempty,toarray"` //set omitempty for every field +// //and encode struct as an array +// } +// +// The mode of encoding is based on the type of the value. When a value is seen: +// - If an extension is registered for it, call that extension function +// - If it implements BinaryMarshaler, call its MarshalBinary() (data []byte, err error) +// - Else encode it based on its reflect.Kind +// +// Note that struct field names and keys in map[string]XXX will be treated as symbols. +// Some formats support symbols (e.g. binc) and will properly encode the string +// only once in the stream, and use a tag to refer to it thereafter. +func (e *Encoder) Encode(v interface{}) (err error) { + defer panicToErr(&err) + e.encode(v) + e.w.atEndOfEncode() + return +} + +func (e *Encoder) encode(iv interface{}) { + switch v := iv.(type) { + case nil: + e.e.encodeNil() + + case reflect.Value: + e.encodeValue(v) + + case string: + e.e.encodeString(c_UTF8, v) + case bool: + e.e.encodeBool(v) + case int: + e.e.encodeInt(int64(v)) + case int8: + e.e.encodeInt(int64(v)) + case int16: + e.e.encodeInt(int64(v)) + case int32: + e.e.encodeInt(int64(v)) + case int64: + e.e.encodeInt(v) + case uint: + e.e.encodeUint(uint64(v)) + case uint8: + e.e.encodeUint(uint64(v)) + case uint16: + e.e.encodeUint(uint64(v)) + case uint32: + e.e.encodeUint(uint64(v)) + case uint64: + e.e.encodeUint(v) + case float32: + e.e.encodeFloat32(v) + case float64: + e.e.encodeFloat64(v) + + case []interface{}: + e.encSliceIntf(v) + case []string: + e.encSliceStr(v) + case []int64: + e.encSliceInt64(v) + case []uint64: + e.encSliceUint64(v) + case []uint8: + e.e.encodeStringBytes(c_RAW, v) + + case map[interface{}]interface{}: + e.encMapIntfIntf(v) + case map[string]interface{}: + e.encMapStrIntf(v) + case map[string]string: + e.encMapStrStr(v) + case map[int64]interface{}: + e.encMapInt64Intf(v) + case map[uint64]interface{}: + e.encMapUint64Intf(v) + + case *string: + e.e.encodeString(c_UTF8, *v) + case *bool: + e.e.encodeBool(*v) + case *int: + e.e.encodeInt(int64(*v)) + case *int8: + e.e.encodeInt(int64(*v)) + case *int16: + e.e.encodeInt(int64(*v)) + case *int32: + e.e.encodeInt(int64(*v)) + case *int64: + e.e.encodeInt(*v) + case *uint: + e.e.encodeUint(uint64(*v)) + case *uint8: + e.e.encodeUint(uint64(*v)) + case *uint16: + e.e.encodeUint(uint64(*v)) + case *uint32: + e.e.encodeUint(uint64(*v)) + case *uint64: + e.e.encodeUint(*v) + case *float32: + e.e.encodeFloat32(*v) + case *float64: + e.e.encodeFloat64(*v) + + case *[]interface{}: + e.encSliceIntf(*v) + case *[]string: + e.encSliceStr(*v) + case *[]int64: + e.encSliceInt64(*v) + case *[]uint64: + e.encSliceUint64(*v) + case *[]uint8: + e.e.encodeStringBytes(c_RAW, *v) + + case *map[interface{}]interface{}: + e.encMapIntfIntf(*v) + case *map[string]interface{}: + e.encMapStrIntf(*v) + case *map[string]string: + e.encMapStrStr(*v) + case *map[int64]interface{}: + e.encMapInt64Intf(*v) + case *map[uint64]interface{}: + e.encMapUint64Intf(*v) + + default: + e.encodeValue(reflect.ValueOf(iv)) + } +} + +func (e *Encoder) encodeValue(rv reflect.Value) { + for rv.Kind() == reflect.Ptr { + if rv.IsNil() { + e.e.encodeNil() + return + } + rv = rv.Elem() + } + + rt := rv.Type() + rtid := reflect.ValueOf(rt).Pointer() + + // if e.f == nil && e.s == nil { debugf("---->Creating new enc f map for type: %v\n", rt) } + var fn encFn + var ok bool + if useMapForCodecCache { + fn, ok = e.f[rtid] + } else { + for i, v := range e.x { + if v == rtid { + fn, ok = e.s[i], true + break + } + } + } + if !ok { + // debugf("\tCreating new enc fn for type: %v\n", rt) + fi := encFnInfo{ti: getTypeInfo(rtid, rt), e: e, ee: e.e} + fn.i = &fi + if rtid == rawExtTypId { + fn.f = (*encFnInfo).rawExt + } else if e.e.isBuiltinType(rtid) { + fn.f = (*encFnInfo).builtin + } else if xfTag, xfFn := e.h.getEncodeExt(rtid); xfFn != nil { + fi.xfTag, fi.xfFn = xfTag, xfFn + fn.f = (*encFnInfo).ext + } else if supportBinaryMarshal && fi.ti.m { + fn.f = (*encFnInfo).binaryMarshal + } else { + switch rk := rt.Kind(); rk { + case reflect.Bool: + fn.f = (*encFnInfo).kBool + case reflect.String: + fn.f = (*encFnInfo).kString + case reflect.Float64: + fn.f = (*encFnInfo).kFloat64 + case reflect.Float32: + fn.f = (*encFnInfo).kFloat32 + case reflect.Int, reflect.Int8, reflect.Int64, reflect.Int32, reflect.Int16: + fn.f = (*encFnInfo).kInt + case reflect.Uint8, reflect.Uint64, reflect.Uint, reflect.Uint32, reflect.Uint16: + fn.f = (*encFnInfo).kUint + case reflect.Invalid: + fn.f = (*encFnInfo).kInvalid + case reflect.Slice: + fn.f = (*encFnInfo).kSlice + case reflect.Array: + fn.f = (*encFnInfo).kArray + case reflect.Struct: + fn.f = (*encFnInfo).kStruct + // case reflect.Ptr: + // fn.f = (*encFnInfo).kPtr + case reflect.Interface: + fn.f = (*encFnInfo).kInterface + case reflect.Map: + fn.f = (*encFnInfo).kMap + default: + fn.f = (*encFnInfo).kErr + } + } + if useMapForCodecCache { + if e.f == nil { + e.f = make(map[uintptr]encFn, 16) + } + e.f[rtid] = fn + } else { + e.s = append(e.s, fn) + e.x = append(e.x, rtid) + } + } + + fn.f(fn.i, rv) + +} + +func (e *Encoder) encRawExt(re RawExt) { + if re.Data == nil { + e.e.encodeNil() + return + } + if e.hh.writeExt() { + e.e.encodeExtPreamble(re.Tag, len(re.Data)) + e.w.writeb(re.Data) + } else { + e.e.encodeStringBytes(c_RAW, re.Data) + } +} + +// --------------------------------------------- +// short circuit functions for common maps and slices + +func (e *Encoder) encSliceIntf(v []interface{}) { + e.e.encodeArrayPreamble(len(v)) + for _, v2 := range v { + e.encode(v2) + } +} + +func (e *Encoder) encSliceStr(v []string) { + e.e.encodeArrayPreamble(len(v)) + for _, v2 := range v { + e.e.encodeString(c_UTF8, v2) + } +} + +func (e *Encoder) encSliceInt64(v []int64) { + e.e.encodeArrayPreamble(len(v)) + for _, v2 := range v { + e.e.encodeInt(v2) + } +} + +func (e *Encoder) encSliceUint64(v []uint64) { + e.e.encodeArrayPreamble(len(v)) + for _, v2 := range v { + e.e.encodeUint(v2) + } +} + +func (e *Encoder) encMapStrStr(v map[string]string) { + e.e.encodeMapPreamble(len(v)) + asSymbols := e.h.AsSymbols&AsSymbolMapStringKeysFlag != 0 + for k2, v2 := range v { + if asSymbols { + e.e.encodeSymbol(k2) + } else { + e.e.encodeString(c_UTF8, k2) + } + e.e.encodeString(c_UTF8, v2) + } +} + +func (e *Encoder) encMapStrIntf(v map[string]interface{}) { + e.e.encodeMapPreamble(len(v)) + asSymbols := e.h.AsSymbols&AsSymbolMapStringKeysFlag != 0 + for k2, v2 := range v { + if asSymbols { + e.e.encodeSymbol(k2) + } else { + e.e.encodeString(c_UTF8, k2) + } + e.encode(v2) + } +} + +func (e *Encoder) encMapInt64Intf(v map[int64]interface{}) { + e.e.encodeMapPreamble(len(v)) + for k2, v2 := range v { + e.e.encodeInt(k2) + e.encode(v2) + } +} + +func (e *Encoder) encMapUint64Intf(v map[uint64]interface{}) { + e.e.encodeMapPreamble(len(v)) + for k2, v2 := range v { + e.e.encodeUint(uint64(k2)) + e.encode(v2) + } +} + +func (e *Encoder) encMapIntfIntf(v map[interface{}]interface{}) { + e.e.encodeMapPreamble(len(v)) + for k2, v2 := range v { + e.encode(k2) + e.encode(v2) + } +} + +// ---------------------------------------- + +func encErr(format string, params ...interface{}) { + doPanic(msgTagEnc, format, params...) +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/helper.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/helper.go new file mode 100644 index 00000000000..e6dc0563f09 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/helper.go @@ -0,0 +1,589 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +package codec + +// Contains code shared by both encode and decode. + +import ( + "encoding/binary" + "fmt" + "math" + "reflect" + "sort" + "strings" + "sync" + "time" + "unicode" + "unicode/utf8" +) + +const ( + structTagName = "codec" + + // Support + // encoding.BinaryMarshaler: MarshalBinary() (data []byte, err error) + // encoding.BinaryUnmarshaler: UnmarshalBinary(data []byte) error + // This constant flag will enable or disable it. + supportBinaryMarshal = true + + // Each Encoder or Decoder uses a cache of functions based on conditionals, + // so that the conditionals are not run every time. + // + // Either a map or a slice is used to keep track of the functions. + // The map is more natural, but has a higher cost than a slice/array. + // This flag (useMapForCodecCache) controls which is used. + useMapForCodecCache = false + + // For some common container types, we can short-circuit an elaborate + // reflection dance and call encode/decode directly. + // The currently supported types are: + // - slices of strings, or id's (int64,uint64) or interfaces. + // - maps of str->str, str->intf, id(int64,uint64)->intf, intf->intf + shortCircuitReflectToFastPath = true + + // for debugging, set this to false, to catch panic traces. + // Note that this will always cause rpc tests to fail, since they need io.EOF sent via panic. + recoverPanicToErr = true +) + +type charEncoding uint8 + +const ( + c_RAW charEncoding = iota + c_UTF8 + c_UTF16LE + c_UTF16BE + c_UTF32LE + c_UTF32BE +) + +// valueType is the stream type +type valueType uint8 + +const ( + valueTypeUnset valueType = iota + valueTypeNil + valueTypeInt + valueTypeUint + valueTypeFloat + valueTypeBool + valueTypeString + valueTypeSymbol + valueTypeBytes + valueTypeMap + valueTypeArray + valueTypeTimestamp + valueTypeExt + + valueTypeInvalid = 0xff +) + +var ( + bigen = binary.BigEndian + structInfoFieldName = "_struct" + + cachedTypeInfo = make(map[uintptr]*typeInfo, 4) + cachedTypeInfoMutex sync.RWMutex + + intfSliceTyp = reflect.TypeOf([]interface{}(nil)) + intfTyp = intfSliceTyp.Elem() + + strSliceTyp = reflect.TypeOf([]string(nil)) + boolSliceTyp = reflect.TypeOf([]bool(nil)) + uintSliceTyp = reflect.TypeOf([]uint(nil)) + uint8SliceTyp = reflect.TypeOf([]uint8(nil)) + uint16SliceTyp = reflect.TypeOf([]uint16(nil)) + uint32SliceTyp = reflect.TypeOf([]uint32(nil)) + uint64SliceTyp = reflect.TypeOf([]uint64(nil)) + intSliceTyp = reflect.TypeOf([]int(nil)) + int8SliceTyp = reflect.TypeOf([]int8(nil)) + int16SliceTyp = reflect.TypeOf([]int16(nil)) + int32SliceTyp = reflect.TypeOf([]int32(nil)) + int64SliceTyp = reflect.TypeOf([]int64(nil)) + float32SliceTyp = reflect.TypeOf([]float32(nil)) + float64SliceTyp = reflect.TypeOf([]float64(nil)) + + mapIntfIntfTyp = reflect.TypeOf(map[interface{}]interface{}(nil)) + mapStrIntfTyp = reflect.TypeOf(map[string]interface{}(nil)) + mapStrStrTyp = reflect.TypeOf(map[string]string(nil)) + + mapIntIntfTyp = reflect.TypeOf(map[int]interface{}(nil)) + mapInt64IntfTyp = reflect.TypeOf(map[int64]interface{}(nil)) + mapUintIntfTyp = reflect.TypeOf(map[uint]interface{}(nil)) + mapUint64IntfTyp = reflect.TypeOf(map[uint64]interface{}(nil)) + + stringTyp = reflect.TypeOf("") + timeTyp = reflect.TypeOf(time.Time{}) + rawExtTyp = reflect.TypeOf(RawExt{}) + + mapBySliceTyp = reflect.TypeOf((*MapBySlice)(nil)).Elem() + binaryMarshalerTyp = reflect.TypeOf((*binaryMarshaler)(nil)).Elem() + binaryUnmarshalerTyp = reflect.TypeOf((*binaryUnmarshaler)(nil)).Elem() + + rawExtTypId = reflect.ValueOf(rawExtTyp).Pointer() + intfTypId = reflect.ValueOf(intfTyp).Pointer() + timeTypId = reflect.ValueOf(timeTyp).Pointer() + + intfSliceTypId = reflect.ValueOf(intfSliceTyp).Pointer() + strSliceTypId = reflect.ValueOf(strSliceTyp).Pointer() + + boolSliceTypId = reflect.ValueOf(boolSliceTyp).Pointer() + uintSliceTypId = reflect.ValueOf(uintSliceTyp).Pointer() + uint8SliceTypId = reflect.ValueOf(uint8SliceTyp).Pointer() + uint16SliceTypId = reflect.ValueOf(uint16SliceTyp).Pointer() + uint32SliceTypId = reflect.ValueOf(uint32SliceTyp).Pointer() + uint64SliceTypId = reflect.ValueOf(uint64SliceTyp).Pointer() + intSliceTypId = reflect.ValueOf(intSliceTyp).Pointer() + int8SliceTypId = reflect.ValueOf(int8SliceTyp).Pointer() + int16SliceTypId = reflect.ValueOf(int16SliceTyp).Pointer() + int32SliceTypId = reflect.ValueOf(int32SliceTyp).Pointer() + int64SliceTypId = reflect.ValueOf(int64SliceTyp).Pointer() + float32SliceTypId = reflect.ValueOf(float32SliceTyp).Pointer() + float64SliceTypId = reflect.ValueOf(float64SliceTyp).Pointer() + + mapStrStrTypId = reflect.ValueOf(mapStrStrTyp).Pointer() + mapIntfIntfTypId = reflect.ValueOf(mapIntfIntfTyp).Pointer() + mapStrIntfTypId = reflect.ValueOf(mapStrIntfTyp).Pointer() + mapIntIntfTypId = reflect.ValueOf(mapIntIntfTyp).Pointer() + mapInt64IntfTypId = reflect.ValueOf(mapInt64IntfTyp).Pointer() + mapUintIntfTypId = reflect.ValueOf(mapUintIntfTyp).Pointer() + mapUint64IntfTypId = reflect.ValueOf(mapUint64IntfTyp).Pointer() + // Id = reflect.ValueOf().Pointer() + // mapBySliceTypId = reflect.ValueOf(mapBySliceTyp).Pointer() + + binaryMarshalerTypId = reflect.ValueOf(binaryMarshalerTyp).Pointer() + binaryUnmarshalerTypId = reflect.ValueOf(binaryUnmarshalerTyp).Pointer() + + intBitsize uint8 = uint8(reflect.TypeOf(int(0)).Bits()) + uintBitsize uint8 = uint8(reflect.TypeOf(uint(0)).Bits()) + + bsAll0x00 = []byte{0, 0, 0, 0, 0, 0, 0, 0} + bsAll0xff = []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} +) + +type binaryUnmarshaler interface { + UnmarshalBinary(data []byte) error +} + +type binaryMarshaler interface { + MarshalBinary() (data []byte, err error) +} + +// MapBySlice represents a slice which should be encoded as a map in the stream. +// The slice contains a sequence of key-value pairs. +type MapBySlice interface { + MapBySlice() +} + +// WARNING: DO NOT USE DIRECTLY. EXPORTED FOR GODOC BENEFIT. WILL BE REMOVED. +// +// BasicHandle encapsulates the common options and extension functions. +type BasicHandle struct { + extHandle + EncodeOptions + DecodeOptions +} + +// Handle is the interface for a specific encoding format. +// +// Typically, a Handle is pre-configured before first time use, +// and not modified while in use. Such a pre-configured Handle +// is safe for concurrent access. +type Handle interface { + writeExt() bool + getBasicHandle() *BasicHandle + newEncDriver(w encWriter) encDriver + newDecDriver(r decReader) decDriver +} + +// RawExt represents raw unprocessed extension data. +type RawExt struct { + Tag byte + Data []byte +} + +type extTypeTagFn struct { + rtid uintptr + rt reflect.Type + tag byte + encFn func(reflect.Value) ([]byte, error) + decFn func(reflect.Value, []byte) error +} + +type extHandle []*extTypeTagFn + +// AddExt registers an encode and decode function for a reflect.Type. +// Note that the type must be a named type, and specifically not +// a pointer or Interface. An error is returned if that is not honored. +// +// To Deregister an ext, call AddExt with 0 tag, nil encfn and nil decfn. +func (o *extHandle) AddExt( + rt reflect.Type, + tag byte, + encfn func(reflect.Value) ([]byte, error), + decfn func(reflect.Value, []byte) error, +) (err error) { + // o is a pointer, because we may need to initialize it + if rt.PkgPath() == "" || rt.Kind() == reflect.Interface { + err = fmt.Errorf("codec.Handle.AddExt: Takes named type, especially not a pointer or interface: %T", + reflect.Zero(rt).Interface()) + return + } + + // o cannot be nil, since it is always embedded in a Handle. + // if nil, let it panic. + // if o == nil { + // err = errors.New("codec.Handle.AddExt: extHandle cannot be a nil pointer.") + // return + // } + + rtid := reflect.ValueOf(rt).Pointer() + for _, v := range *o { + if v.rtid == rtid { + v.tag, v.encFn, v.decFn = tag, encfn, decfn + return + } + } + + *o = append(*o, &extTypeTagFn{rtid, rt, tag, encfn, decfn}) + return +} + +func (o extHandle) getExt(rtid uintptr) *extTypeTagFn { + for _, v := range o { + if v.rtid == rtid { + return v + } + } + return nil +} + +func (o extHandle) getExtForTag(tag byte) *extTypeTagFn { + for _, v := range o { + if v.tag == tag { + return v + } + } + return nil +} + +func (o extHandle) getDecodeExtForTag(tag byte) ( + rv reflect.Value, fn func(reflect.Value, []byte) error) { + if x := o.getExtForTag(tag); x != nil { + // ext is only registered for base + rv = reflect.New(x.rt).Elem() + fn = x.decFn + } + return +} + +func (o extHandle) getDecodeExt(rtid uintptr) (tag byte, fn func(reflect.Value, []byte) error) { + if x := o.getExt(rtid); x != nil { + tag = x.tag + fn = x.decFn + } + return +} + +func (o extHandle) getEncodeExt(rtid uintptr) (tag byte, fn func(reflect.Value) ([]byte, error)) { + if x := o.getExt(rtid); x != nil { + tag = x.tag + fn = x.encFn + } + return +} + +type structFieldInfo struct { + encName string // encode name + + // only one of 'i' or 'is' can be set. If 'i' is -1, then 'is' has been set. + + is []int // (recursive/embedded) field index in struct + i int16 // field index in struct + omitEmpty bool + toArray bool // if field is _struct, is the toArray set? + + // tag string // tag + // name string // field name + // encNameBs []byte // encoded name as byte stream + // ikind int // kind of the field as an int i.e. int(reflect.Kind) +} + +func parseStructFieldInfo(fname string, stag string) *structFieldInfo { + if fname == "" { + panic("parseStructFieldInfo: No Field Name") + } + si := structFieldInfo{ + // name: fname, + encName: fname, + // tag: stag, + } + + if stag != "" { + for i, s := range strings.Split(stag, ",") { + if i == 0 { + if s != "" { + si.encName = s + } + } else { + switch s { + case "omitempty": + si.omitEmpty = true + case "toarray": + si.toArray = true + } + } + } + } + // si.encNameBs = []byte(si.encName) + return &si +} + +type sfiSortedByEncName []*structFieldInfo + +func (p sfiSortedByEncName) Len() int { + return len(p) +} + +func (p sfiSortedByEncName) Less(i, j int) bool { + return p[i].encName < p[j].encName +} + +func (p sfiSortedByEncName) Swap(i, j int) { + p[i], p[j] = p[j], p[i] +} + +// typeInfo keeps information about each type referenced in the encode/decode sequence. +// +// During an encode/decode sequence, we work as below: +// - If base is a built in type, en/decode base value +// - If base is registered as an extension, en/decode base value +// - If type is binary(M/Unm)arshaler, call Binary(M/Unm)arshal method +// - Else decode appropriately based on the reflect.Kind +type typeInfo struct { + sfi []*structFieldInfo // sorted. Used when enc/dec struct to map. + sfip []*structFieldInfo // unsorted. Used when enc/dec struct to array. + + rt reflect.Type + rtid uintptr + + // baseId gives pointer to the base reflect.Type, after deferencing + // the pointers. E.g. base type of ***time.Time is time.Time. + base reflect.Type + baseId uintptr + baseIndir int8 // number of indirections to get to base + + mbs bool // base type (T or *T) is a MapBySlice + + m bool // base type (T or *T) is a binaryMarshaler + unm bool // base type (T or *T) is a binaryUnmarshaler + mIndir int8 // number of indirections to get to binaryMarshaler type + unmIndir int8 // number of indirections to get to binaryUnmarshaler type + toArray bool // whether this (struct) type should be encoded as an array +} + +func (ti *typeInfo) indexForEncName(name string) int { + //tisfi := ti.sfi + const binarySearchThreshold = 16 + if sfilen := len(ti.sfi); sfilen < binarySearchThreshold { + // linear search. faster than binary search in my testing up to 16-field structs. + for i, si := range ti.sfi { + if si.encName == name { + return i + } + } + } else { + // binary search. adapted from sort/search.go. + h, i, j := 0, 0, sfilen + for i < j { + h = i + (j-i)/2 + if ti.sfi[h].encName < name { + i = h + 1 + } else { + j = h + } + } + if i < sfilen && ti.sfi[i].encName == name { + return i + } + } + return -1 +} + +func getTypeInfo(rtid uintptr, rt reflect.Type) (pti *typeInfo) { + var ok bool + cachedTypeInfoMutex.RLock() + pti, ok = cachedTypeInfo[rtid] + cachedTypeInfoMutex.RUnlock() + if ok { + return + } + + cachedTypeInfoMutex.Lock() + defer cachedTypeInfoMutex.Unlock() + if pti, ok = cachedTypeInfo[rtid]; ok { + return + } + + ti := typeInfo{rt: rt, rtid: rtid} + pti = &ti + + var indir int8 + if ok, indir = implementsIntf(rt, binaryMarshalerTyp); ok { + ti.m, ti.mIndir = true, indir + } + if ok, indir = implementsIntf(rt, binaryUnmarshalerTyp); ok { + ti.unm, ti.unmIndir = true, indir + } + if ok, _ = implementsIntf(rt, mapBySliceTyp); ok { + ti.mbs = true + } + + pt := rt + var ptIndir int8 + // for ; pt.Kind() == reflect.Ptr; pt, ptIndir = pt.Elem(), ptIndir+1 { } + for pt.Kind() == reflect.Ptr { + pt = pt.Elem() + ptIndir++ + } + if ptIndir == 0 { + ti.base = rt + ti.baseId = rtid + } else { + ti.base = pt + ti.baseId = reflect.ValueOf(pt).Pointer() + ti.baseIndir = ptIndir + } + + if rt.Kind() == reflect.Struct { + var siInfo *structFieldInfo + if f, ok := rt.FieldByName(structInfoFieldName); ok { + siInfo = parseStructFieldInfo(structInfoFieldName, f.Tag.Get(structTagName)) + ti.toArray = siInfo.toArray + } + sfip := make([]*structFieldInfo, 0, rt.NumField()) + rgetTypeInfo(rt, nil, make(map[string]bool), &sfip, siInfo) + + // // try to put all si close together + // const tryToPutAllStructFieldInfoTogether = true + // if tryToPutAllStructFieldInfoTogether { + // sfip2 := make([]structFieldInfo, len(sfip)) + // for i, si := range sfip { + // sfip2[i] = *si + // } + // for i := range sfip { + // sfip[i] = &sfip2[i] + // } + // } + + ti.sfip = make([]*structFieldInfo, len(sfip)) + ti.sfi = make([]*structFieldInfo, len(sfip)) + copy(ti.sfip, sfip) + sort.Sort(sfiSortedByEncName(sfip)) + copy(ti.sfi, sfip) + } + // sfi = sfip + cachedTypeInfo[rtid] = pti + return +} + +func rgetTypeInfo(rt reflect.Type, indexstack []int, fnameToHastag map[string]bool, + sfi *[]*structFieldInfo, siInfo *structFieldInfo, +) { + // for rt.Kind() == reflect.Ptr { + // // indexstack = append(indexstack, 0) + // rt = rt.Elem() + // } + for j := 0; j < rt.NumField(); j++ { + f := rt.Field(j) + stag := f.Tag.Get(structTagName) + if stag == "-" { + continue + } + if r1, _ := utf8.DecodeRuneInString(f.Name); r1 == utf8.RuneError || !unicode.IsUpper(r1) { + continue + } + // if anonymous and there is no struct tag and its a struct (or pointer to struct), inline it. + if f.Anonymous && stag == "" { + ft := f.Type + for ft.Kind() == reflect.Ptr { + ft = ft.Elem() + } + if ft.Kind() == reflect.Struct { + indexstack2 := append(append(make([]int, 0, len(indexstack)+4), indexstack...), j) + rgetTypeInfo(ft, indexstack2, fnameToHastag, sfi, siInfo) + continue + } + } + // do not let fields with same name in embedded structs override field at higher level. + // this must be done after anonymous check, to allow anonymous field + // still include their child fields + if _, ok := fnameToHastag[f.Name]; ok { + continue + } + si := parseStructFieldInfo(f.Name, stag) + // si.ikind = int(f.Type.Kind()) + if len(indexstack) == 0 { + si.i = int16(j) + } else { + si.i = -1 + si.is = append(append(make([]int, 0, len(indexstack)+4), indexstack...), j) + } + + if siInfo != nil { + if siInfo.omitEmpty { + si.omitEmpty = true + } + } + *sfi = append(*sfi, si) + fnameToHastag[f.Name] = stag != "" + } +} + +func panicToErr(err *error) { + if recoverPanicToErr { + if x := recover(); x != nil { + //debug.PrintStack() + panicValToErr(x, err) + } + } +} + +func doPanic(tag string, format string, params ...interface{}) { + params2 := make([]interface{}, len(params)+1) + params2[0] = tag + copy(params2[1:], params) + panic(fmt.Errorf("%s: "+format, params2...)) +} + +func checkOverflowFloat32(f float64, doCheck bool) { + if !doCheck { + return + } + // check overflow (logic adapted from std pkg reflect/value.go OverflowFloat() + f2 := f + if f2 < 0 { + f2 = -f + } + if math.MaxFloat32 < f2 && f2 <= math.MaxFloat64 { + decErr("Overflow float32 value: %v", f2) + } +} + +func checkOverflow(ui uint64, i int64, bitsize uint8) { + // check overflow (logic adapted from std pkg reflect/value.go OverflowUint() + if bitsize == 0 { + return + } + if i != 0 { + if trunc := (i << (64 - bitsize)) >> (64 - bitsize); i != trunc { + decErr("Overflow int value: %v", i) + } + } + if ui != 0 { + if trunc := (ui << (64 - bitsize)) >> (64 - bitsize); ui != trunc { + decErr("Overflow uint value: %v", ui) + } + } +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/helper_internal.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/helper_internal.go new file mode 100644 index 00000000000..58417da958f --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/helper_internal.go @@ -0,0 +1,127 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +package codec + +// All non-std package dependencies live in this file, +// so porting to different environment is easy (just update functions). + +import ( + "errors" + "fmt" + "math" + "reflect" +) + +var ( + raisePanicAfterRecover = false + debugging = true +) + +func panicValToErr(panicVal interface{}, err *error) { + switch xerr := panicVal.(type) { + case error: + *err = xerr + case string: + *err = errors.New(xerr) + default: + *err = fmt.Errorf("%v", panicVal) + } + if raisePanicAfterRecover { + panic(panicVal) + } + return +} + +func isEmptyValueDeref(v reflect.Value, deref bool) bool { + switch v.Kind() { + case reflect.Array, reflect.Map, reflect.Slice, reflect.String: + return v.Len() == 0 + case reflect.Bool: + return !v.Bool() + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + return v.Int() == 0 + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: + return v.Uint() == 0 + case reflect.Float32, reflect.Float64: + return v.Float() == 0 + case reflect.Interface, reflect.Ptr: + if deref { + if v.IsNil() { + return true + } + return isEmptyValueDeref(v.Elem(), deref) + } else { + return v.IsNil() + } + case reflect.Struct: + // return true if all fields are empty. else return false. + + // we cannot use equality check, because some fields may be maps/slices/etc + // and consequently the structs are not comparable. + // return v.Interface() == reflect.Zero(v.Type()).Interface() + for i, n := 0, v.NumField(); i < n; i++ { + if !isEmptyValueDeref(v.Field(i), deref) { + return false + } + } + return true + } + return false +} + +func isEmptyValue(v reflect.Value) bool { + return isEmptyValueDeref(v, true) +} + +func debugf(format string, args ...interface{}) { + if debugging { + if len(format) == 0 || format[len(format)-1] != '\n' { + format = format + "\n" + } + fmt.Printf(format, args...) + } +} + +func pruneSignExt(v []byte, pos bool) (n int) { + if len(v) < 2 { + } else if pos && v[0] == 0 { + for ; v[n] == 0 && n+1 < len(v) && (v[n+1]&(1<<7) == 0); n++ { + } + } else if !pos && v[0] == 0xff { + for ; v[n] == 0xff && n+1 < len(v) && (v[n+1]&(1<<7) != 0); n++ { + } + } + return +} + +func implementsIntf(typ, iTyp reflect.Type) (success bool, indir int8) { + if typ == nil { + return + } + rt := typ + // The type might be a pointer and we need to keep + // dereferencing to the base type until we find an implementation. + for { + if rt.Implements(iTyp) { + return true, indir + } + if p := rt; p.Kind() == reflect.Ptr { + indir++ + if indir >= math.MaxInt8 { // insane number of indirections + return false, 0 + } + rt = p.Elem() + continue + } + break + } + // No luck yet, but if this is a base type (non-pointer), the pointer might satisfy. + if typ.Kind() != reflect.Ptr { + // Not a pointer, but does the pointer work? + if reflect.PtrTo(typ).Implements(iTyp) { + return true, -1 + } + } + return false, 0 +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/msgpack.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/msgpack.go new file mode 100644 index 00000000000..da0500d1922 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/msgpack.go @@ -0,0 +1,816 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +/* +MSGPACK + +Msgpack-c implementation powers the c, c++, python, ruby, etc libraries. +We need to maintain compatibility with it and how it encodes integer values +without caring about the type. + +For compatibility with behaviour of msgpack-c reference implementation: + - Go intX (>0) and uintX + IS ENCODED AS + msgpack +ve fixnum, unsigned + - Go intX (<0) + IS ENCODED AS + msgpack -ve fixnum, signed + +*/ +package codec + +import ( + "fmt" + "io" + "math" + "net/rpc" +) + +const ( + mpPosFixNumMin byte = 0x00 + mpPosFixNumMax = 0x7f + mpFixMapMin = 0x80 + mpFixMapMax = 0x8f + mpFixArrayMin = 0x90 + mpFixArrayMax = 0x9f + mpFixStrMin = 0xa0 + mpFixStrMax = 0xbf + mpNil = 0xc0 + _ = 0xc1 + mpFalse = 0xc2 + mpTrue = 0xc3 + mpFloat = 0xca + mpDouble = 0xcb + mpUint8 = 0xcc + mpUint16 = 0xcd + mpUint32 = 0xce + mpUint64 = 0xcf + mpInt8 = 0xd0 + mpInt16 = 0xd1 + mpInt32 = 0xd2 + mpInt64 = 0xd3 + + // extensions below + mpBin8 = 0xc4 + mpBin16 = 0xc5 + mpBin32 = 0xc6 + mpExt8 = 0xc7 + mpExt16 = 0xc8 + mpExt32 = 0xc9 + mpFixExt1 = 0xd4 + mpFixExt2 = 0xd5 + mpFixExt4 = 0xd6 + mpFixExt8 = 0xd7 + mpFixExt16 = 0xd8 + + mpStr8 = 0xd9 // new + mpStr16 = 0xda + mpStr32 = 0xdb + + mpArray16 = 0xdc + mpArray32 = 0xdd + + mpMap16 = 0xde + mpMap32 = 0xdf + + mpNegFixNumMin = 0xe0 + mpNegFixNumMax = 0xff +) + +// MsgpackSpecRpcMultiArgs is a special type which signifies to the MsgpackSpecRpcCodec +// that the backend RPC service takes multiple arguments, which have been arranged +// in sequence in the slice. +// +// The Codec then passes it AS-IS to the rpc service (without wrapping it in an +// array of 1 element). +type MsgpackSpecRpcMultiArgs []interface{} + +// A MsgpackContainer type specifies the different types of msgpackContainers. +type msgpackContainerType struct { + fixCutoff int + bFixMin, b8, b16, b32 byte + hasFixMin, has8, has8Always bool +} + +var ( + msgpackContainerStr = msgpackContainerType{32, mpFixStrMin, mpStr8, mpStr16, mpStr32, true, true, false} + msgpackContainerBin = msgpackContainerType{0, 0, mpBin8, mpBin16, mpBin32, false, true, true} + msgpackContainerList = msgpackContainerType{16, mpFixArrayMin, 0, mpArray16, mpArray32, true, false, false} + msgpackContainerMap = msgpackContainerType{16, mpFixMapMin, 0, mpMap16, mpMap32, true, false, false} +) + +//--------------------------------------------- + +type msgpackEncDriver struct { + w encWriter + h *MsgpackHandle +} + +func (e *msgpackEncDriver) isBuiltinType(rt uintptr) bool { + //no builtin types. All encodings are based on kinds. Types supported as extensions. + return false +} + +func (e *msgpackEncDriver) encodeBuiltin(rt uintptr, v interface{}) {} + +func (e *msgpackEncDriver) encodeNil() { + e.w.writen1(mpNil) +} + +func (e *msgpackEncDriver) encodeInt(i int64) { + + switch { + case i >= 0: + e.encodeUint(uint64(i)) + case i >= -32: + e.w.writen1(byte(i)) + case i >= math.MinInt8: + e.w.writen2(mpInt8, byte(i)) + case i >= math.MinInt16: + e.w.writen1(mpInt16) + e.w.writeUint16(uint16(i)) + case i >= math.MinInt32: + e.w.writen1(mpInt32) + e.w.writeUint32(uint32(i)) + default: + e.w.writen1(mpInt64) + e.w.writeUint64(uint64(i)) + } +} + +func (e *msgpackEncDriver) encodeUint(i uint64) { + switch { + case i <= math.MaxInt8: + e.w.writen1(byte(i)) + case i <= math.MaxUint8: + e.w.writen2(mpUint8, byte(i)) + case i <= math.MaxUint16: + e.w.writen1(mpUint16) + e.w.writeUint16(uint16(i)) + case i <= math.MaxUint32: + e.w.writen1(mpUint32) + e.w.writeUint32(uint32(i)) + default: + e.w.writen1(mpUint64) + e.w.writeUint64(uint64(i)) + } +} + +func (e *msgpackEncDriver) encodeBool(b bool) { + if b { + e.w.writen1(mpTrue) + } else { + e.w.writen1(mpFalse) + } +} + +func (e *msgpackEncDriver) encodeFloat32(f float32) { + e.w.writen1(mpFloat) + e.w.writeUint32(math.Float32bits(f)) +} + +func (e *msgpackEncDriver) encodeFloat64(f float64) { + e.w.writen1(mpDouble) + e.w.writeUint64(math.Float64bits(f)) +} + +func (e *msgpackEncDriver) encodeExtPreamble(xtag byte, l int) { + switch { + case l == 1: + e.w.writen2(mpFixExt1, xtag) + case l == 2: + e.w.writen2(mpFixExt2, xtag) + case l == 4: + e.w.writen2(mpFixExt4, xtag) + case l == 8: + e.w.writen2(mpFixExt8, xtag) + case l == 16: + e.w.writen2(mpFixExt16, xtag) + case l < 256: + e.w.writen2(mpExt8, byte(l)) + e.w.writen1(xtag) + case l < 65536: + e.w.writen1(mpExt16) + e.w.writeUint16(uint16(l)) + e.w.writen1(xtag) + default: + e.w.writen1(mpExt32) + e.w.writeUint32(uint32(l)) + e.w.writen1(xtag) + } +} + +func (e *msgpackEncDriver) encodeArrayPreamble(length int) { + e.writeContainerLen(msgpackContainerList, length) +} + +func (e *msgpackEncDriver) encodeMapPreamble(length int) { + e.writeContainerLen(msgpackContainerMap, length) +} + +func (e *msgpackEncDriver) encodeString(c charEncoding, s string) { + if c == c_RAW && e.h.WriteExt { + e.writeContainerLen(msgpackContainerBin, len(s)) + } else { + e.writeContainerLen(msgpackContainerStr, len(s)) + } + if len(s) > 0 { + e.w.writestr(s) + } +} + +func (e *msgpackEncDriver) encodeSymbol(v string) { + e.encodeString(c_UTF8, v) +} + +func (e *msgpackEncDriver) encodeStringBytes(c charEncoding, bs []byte) { + if c == c_RAW && e.h.WriteExt { + e.writeContainerLen(msgpackContainerBin, len(bs)) + } else { + e.writeContainerLen(msgpackContainerStr, len(bs)) + } + if len(bs) > 0 { + e.w.writeb(bs) + } +} + +func (e *msgpackEncDriver) writeContainerLen(ct msgpackContainerType, l int) { + switch { + case ct.hasFixMin && l < ct.fixCutoff: + e.w.writen1(ct.bFixMin | byte(l)) + case ct.has8 && l < 256 && (ct.has8Always || e.h.WriteExt): + e.w.writen2(ct.b8, uint8(l)) + case l < 65536: + e.w.writen1(ct.b16) + e.w.writeUint16(uint16(l)) + default: + e.w.writen1(ct.b32) + e.w.writeUint32(uint32(l)) + } +} + +//--------------------------------------------- + +type msgpackDecDriver struct { + r decReader + h *MsgpackHandle + bd byte + bdRead bool + bdType valueType +} + +func (d *msgpackDecDriver) isBuiltinType(rt uintptr) bool { + //no builtin types. All encodings are based on kinds. Types supported as extensions. + return false +} + +func (d *msgpackDecDriver) decodeBuiltin(rt uintptr, v interface{}) {} + +// Note: This returns either a primitive (int, bool, etc) for non-containers, +// or a containerType, or a specific type denoting nil or extension. +// It is called when a nil interface{} is passed, leaving it up to the DecDriver +// to introspect the stream and decide how best to decode. +// It deciphers the value by looking at the stream first. +func (d *msgpackDecDriver) decodeNaked() (v interface{}, vt valueType, decodeFurther bool) { + d.initReadNext() + bd := d.bd + + switch bd { + case mpNil: + vt = valueTypeNil + d.bdRead = false + case mpFalse: + vt = valueTypeBool + v = false + case mpTrue: + vt = valueTypeBool + v = true + + case mpFloat: + vt = valueTypeFloat + v = float64(math.Float32frombits(d.r.readUint32())) + case mpDouble: + vt = valueTypeFloat + v = math.Float64frombits(d.r.readUint64()) + + case mpUint8: + vt = valueTypeUint + v = uint64(d.r.readn1()) + case mpUint16: + vt = valueTypeUint + v = uint64(d.r.readUint16()) + case mpUint32: + vt = valueTypeUint + v = uint64(d.r.readUint32()) + case mpUint64: + vt = valueTypeUint + v = uint64(d.r.readUint64()) + + case mpInt8: + vt = valueTypeInt + v = int64(int8(d.r.readn1())) + case mpInt16: + vt = valueTypeInt + v = int64(int16(d.r.readUint16())) + case mpInt32: + vt = valueTypeInt + v = int64(int32(d.r.readUint32())) + case mpInt64: + vt = valueTypeInt + v = int64(int64(d.r.readUint64())) + + default: + switch { + case bd >= mpPosFixNumMin && bd <= mpPosFixNumMax: + // positive fixnum (always signed) + vt = valueTypeInt + v = int64(int8(bd)) + case bd >= mpNegFixNumMin && bd <= mpNegFixNumMax: + // negative fixnum + vt = valueTypeInt + v = int64(int8(bd)) + case bd == mpStr8, bd == mpStr16, bd == mpStr32, bd >= mpFixStrMin && bd <= mpFixStrMax: + if d.h.RawToString { + var rvm string + vt = valueTypeString + v = &rvm + } else { + var rvm = []byte{} + vt = valueTypeBytes + v = &rvm + } + decodeFurther = true + case bd == mpBin8, bd == mpBin16, bd == mpBin32: + var rvm = []byte{} + vt = valueTypeBytes + v = &rvm + decodeFurther = true + case bd == mpArray16, bd == mpArray32, bd >= mpFixArrayMin && bd <= mpFixArrayMax: + vt = valueTypeArray + decodeFurther = true + case bd == mpMap16, bd == mpMap32, bd >= mpFixMapMin && bd <= mpFixMapMax: + vt = valueTypeMap + decodeFurther = true + case bd >= mpFixExt1 && bd <= mpFixExt16, bd >= mpExt8 && bd <= mpExt32: + clen := d.readExtLen() + var re RawExt + re.Tag = d.r.readn1() + re.Data = d.r.readn(clen) + v = &re + vt = valueTypeExt + default: + decErr("Nil-Deciphered DecodeValue: %s: hex: %x, dec: %d", msgBadDesc, bd, bd) + } + } + if !decodeFurther { + d.bdRead = false + } + return +} + +// int can be decoded from msgpack type: intXXX or uintXXX +func (d *msgpackDecDriver) decodeInt(bitsize uint8) (i int64) { + switch d.bd { + case mpUint8: + i = int64(uint64(d.r.readn1())) + case mpUint16: + i = int64(uint64(d.r.readUint16())) + case mpUint32: + i = int64(uint64(d.r.readUint32())) + case mpUint64: + i = int64(d.r.readUint64()) + case mpInt8: + i = int64(int8(d.r.readn1())) + case mpInt16: + i = int64(int16(d.r.readUint16())) + case mpInt32: + i = int64(int32(d.r.readUint32())) + case mpInt64: + i = int64(d.r.readUint64()) + default: + switch { + case d.bd >= mpPosFixNumMin && d.bd <= mpPosFixNumMax: + i = int64(int8(d.bd)) + case d.bd >= mpNegFixNumMin && d.bd <= mpNegFixNumMax: + i = int64(int8(d.bd)) + default: + decErr("Unhandled single-byte unsigned integer value: %s: %x", msgBadDesc, d.bd) + } + } + // check overflow (logic adapted from std pkg reflect/value.go OverflowUint() + if bitsize > 0 { + if trunc := (i << (64 - bitsize)) >> (64 - bitsize); i != trunc { + decErr("Overflow int value: %v", i) + } + } + d.bdRead = false + return +} + +// uint can be decoded from msgpack type: intXXX or uintXXX +func (d *msgpackDecDriver) decodeUint(bitsize uint8) (ui uint64) { + switch d.bd { + case mpUint8: + ui = uint64(d.r.readn1()) + case mpUint16: + ui = uint64(d.r.readUint16()) + case mpUint32: + ui = uint64(d.r.readUint32()) + case mpUint64: + ui = d.r.readUint64() + case mpInt8: + if i := int64(int8(d.r.readn1())); i >= 0 { + ui = uint64(i) + } else { + decErr("Assigning negative signed value: %v, to unsigned type", i) + } + case mpInt16: + if i := int64(int16(d.r.readUint16())); i >= 0 { + ui = uint64(i) + } else { + decErr("Assigning negative signed value: %v, to unsigned type", i) + } + case mpInt32: + if i := int64(int32(d.r.readUint32())); i >= 0 { + ui = uint64(i) + } else { + decErr("Assigning negative signed value: %v, to unsigned type", i) + } + case mpInt64: + if i := int64(d.r.readUint64()); i >= 0 { + ui = uint64(i) + } else { + decErr("Assigning negative signed value: %v, to unsigned type", i) + } + default: + switch { + case d.bd >= mpPosFixNumMin && d.bd <= mpPosFixNumMax: + ui = uint64(d.bd) + case d.bd >= mpNegFixNumMin && d.bd <= mpNegFixNumMax: + decErr("Assigning negative signed value: %v, to unsigned type", int(d.bd)) + default: + decErr("Unhandled single-byte unsigned integer value: %s: %x", msgBadDesc, d.bd) + } + } + // check overflow (logic adapted from std pkg reflect/value.go OverflowUint() + if bitsize > 0 { + if trunc := (ui << (64 - bitsize)) >> (64 - bitsize); ui != trunc { + decErr("Overflow uint value: %v", ui) + } + } + d.bdRead = false + return +} + +// float can either be decoded from msgpack type: float, double or intX +func (d *msgpackDecDriver) decodeFloat(chkOverflow32 bool) (f float64) { + switch d.bd { + case mpFloat: + f = float64(math.Float32frombits(d.r.readUint32())) + case mpDouble: + f = math.Float64frombits(d.r.readUint64()) + default: + f = float64(d.decodeInt(0)) + } + checkOverflowFloat32(f, chkOverflow32) + d.bdRead = false + return +} + +// bool can be decoded from bool, fixnum 0 or 1. +func (d *msgpackDecDriver) decodeBool() (b bool) { + switch d.bd { + case mpFalse, 0: + // b = false + case mpTrue, 1: + b = true + default: + decErr("Invalid single-byte value for bool: %s: %x", msgBadDesc, d.bd) + } + d.bdRead = false + return +} + +func (d *msgpackDecDriver) decodeString() (s string) { + clen := d.readContainerLen(msgpackContainerStr) + if clen > 0 { + s = string(d.r.readn(clen)) + } + d.bdRead = false + return +} + +// Callers must check if changed=true (to decide whether to replace the one they have) +func (d *msgpackDecDriver) decodeBytes(bs []byte) (bsOut []byte, changed bool) { + // bytes can be decoded from msgpackContainerStr or msgpackContainerBin + var clen int + switch d.bd { + case mpBin8, mpBin16, mpBin32: + clen = d.readContainerLen(msgpackContainerBin) + default: + clen = d.readContainerLen(msgpackContainerStr) + } + // if clen < 0 { + // changed = true + // panic("length cannot be zero. this cannot be nil.") + // } + if clen > 0 { + // if no contents in stream, don't update the passed byteslice + if len(bs) != clen { + // Return changed=true if length of passed slice diff from length of bytes in stream + if len(bs) > clen { + bs = bs[:clen] + } else { + bs = make([]byte, clen) + } + bsOut = bs + changed = true + } + d.r.readb(bs) + } + d.bdRead = false + return +} + +// Every top-level decode funcs (i.e. decodeValue, decode) must call this first. +func (d *msgpackDecDriver) initReadNext() { + if d.bdRead { + return + } + d.bd = d.r.readn1() + d.bdRead = true + d.bdType = valueTypeUnset +} + +func (d *msgpackDecDriver) currentEncodedType() valueType { + if d.bdType == valueTypeUnset { + bd := d.bd + switch bd { + case mpNil: + d.bdType = valueTypeNil + case mpFalse, mpTrue: + d.bdType = valueTypeBool + case mpFloat, mpDouble: + d.bdType = valueTypeFloat + case mpUint8, mpUint16, mpUint32, mpUint64: + d.bdType = valueTypeUint + case mpInt8, mpInt16, mpInt32, mpInt64: + d.bdType = valueTypeInt + default: + switch { + case bd >= mpPosFixNumMin && bd <= mpPosFixNumMax: + d.bdType = valueTypeInt + case bd >= mpNegFixNumMin && bd <= mpNegFixNumMax: + d.bdType = valueTypeInt + case bd == mpStr8, bd == mpStr16, bd == mpStr32, bd >= mpFixStrMin && bd <= mpFixStrMax: + if d.h.RawToString { + d.bdType = valueTypeString + } else { + d.bdType = valueTypeBytes + } + case bd == mpBin8, bd == mpBin16, bd == mpBin32: + d.bdType = valueTypeBytes + case bd == mpArray16, bd == mpArray32, bd >= mpFixArrayMin && bd <= mpFixArrayMax: + d.bdType = valueTypeArray + case bd == mpMap16, bd == mpMap32, bd >= mpFixMapMin && bd <= mpFixMapMax: + d.bdType = valueTypeMap + case bd >= mpFixExt1 && bd <= mpFixExt16, bd >= mpExt8 && bd <= mpExt32: + d.bdType = valueTypeExt + default: + decErr("currentEncodedType: Undeciphered descriptor: %s: hex: %x, dec: %d", msgBadDesc, bd, bd) + } + } + } + return d.bdType +} + +func (d *msgpackDecDriver) tryDecodeAsNil() bool { + if d.bd == mpNil { + d.bdRead = false + return true + } + return false +} + +func (d *msgpackDecDriver) readContainerLen(ct msgpackContainerType) (clen int) { + bd := d.bd + switch { + case bd == mpNil: + clen = -1 // to represent nil + case bd == ct.b8: + clen = int(d.r.readn1()) + case bd == ct.b16: + clen = int(d.r.readUint16()) + case bd == ct.b32: + clen = int(d.r.readUint32()) + case (ct.bFixMin & bd) == ct.bFixMin: + clen = int(ct.bFixMin ^ bd) + default: + decErr("readContainerLen: %s: hex: %x, dec: %d", msgBadDesc, bd, bd) + } + d.bdRead = false + return +} + +func (d *msgpackDecDriver) readMapLen() int { + return d.readContainerLen(msgpackContainerMap) +} + +func (d *msgpackDecDriver) readArrayLen() int { + return d.readContainerLen(msgpackContainerList) +} + +func (d *msgpackDecDriver) readExtLen() (clen int) { + switch d.bd { + case mpNil: + clen = -1 // to represent nil + case mpFixExt1: + clen = 1 + case mpFixExt2: + clen = 2 + case mpFixExt4: + clen = 4 + case mpFixExt8: + clen = 8 + case mpFixExt16: + clen = 16 + case mpExt8: + clen = int(d.r.readn1()) + case mpExt16: + clen = int(d.r.readUint16()) + case mpExt32: + clen = int(d.r.readUint32()) + default: + decErr("decoding ext bytes: found unexpected byte: %x", d.bd) + } + return +} + +func (d *msgpackDecDriver) decodeExt(verifyTag bool, tag byte) (xtag byte, xbs []byte) { + xbd := d.bd + switch { + case xbd == mpBin8, xbd == mpBin16, xbd == mpBin32: + xbs, _ = d.decodeBytes(nil) + case xbd == mpStr8, xbd == mpStr16, xbd == mpStr32, + xbd >= mpFixStrMin && xbd <= mpFixStrMax: + xbs = []byte(d.decodeString()) + default: + clen := d.readExtLen() + xtag = d.r.readn1() + if verifyTag && xtag != tag { + decErr("Wrong extension tag. Got %b. Expecting: %v", xtag, tag) + } + xbs = d.r.readn(clen) + } + d.bdRead = false + return +} + +//-------------------------------------------------- + +//MsgpackHandle is a Handle for the Msgpack Schema-Free Encoding Format. +type MsgpackHandle struct { + BasicHandle + + // RawToString controls how raw bytes are decoded into a nil interface{}. + RawToString bool + // WriteExt flag supports encoding configured extensions with extension tags. + // It also controls whether other elements of the new spec are encoded (ie Str8). + // + // With WriteExt=false, configured extensions are serialized as raw bytes + // and Str8 is not encoded. + // + // A stream can still be decoded into a typed value, provided an appropriate value + // is provided, but the type cannot be inferred from the stream. If no appropriate + // type is provided (e.g. decoding into a nil interface{}), you get back + // a []byte or string based on the setting of RawToString. + WriteExt bool +} + +func (h *MsgpackHandle) newEncDriver(w encWriter) encDriver { + return &msgpackEncDriver{w: w, h: h} +} + +func (h *MsgpackHandle) newDecDriver(r decReader) decDriver { + return &msgpackDecDriver{r: r, h: h} +} + +func (h *MsgpackHandle) writeExt() bool { + return h.WriteExt +} + +func (h *MsgpackHandle) getBasicHandle() *BasicHandle { + return &h.BasicHandle +} + +//-------------------------------------------------- + +type msgpackSpecRpcCodec struct { + rpcCodec +} + +// /////////////// Spec RPC Codec /////////////////// +func (c *msgpackSpecRpcCodec) WriteRequest(r *rpc.Request, body interface{}) error { + // WriteRequest can write to both a Go service, and other services that do + // not abide by the 1 argument rule of a Go service. + // We discriminate based on if the body is a MsgpackSpecRpcMultiArgs + var bodyArr []interface{} + if m, ok := body.(MsgpackSpecRpcMultiArgs); ok { + bodyArr = ([]interface{})(m) + } else { + bodyArr = []interface{}{body} + } + r2 := []interface{}{0, uint32(r.Seq), r.ServiceMethod, bodyArr} + return c.write(r2, nil, false, true) +} + +func (c *msgpackSpecRpcCodec) WriteResponse(r *rpc.Response, body interface{}) error { + var moe interface{} + if r.Error != "" { + moe = r.Error + } + if moe != nil && body != nil { + body = nil + } + r2 := []interface{}{1, uint32(r.Seq), moe, body} + return c.write(r2, nil, false, true) +} + +func (c *msgpackSpecRpcCodec) ReadResponseHeader(r *rpc.Response) error { + return c.parseCustomHeader(1, &r.Seq, &r.Error) +} + +func (c *msgpackSpecRpcCodec) ReadRequestHeader(r *rpc.Request) error { + return c.parseCustomHeader(0, &r.Seq, &r.ServiceMethod) +} + +func (c *msgpackSpecRpcCodec) ReadRequestBody(body interface{}) error { + if body == nil { // read and discard + return c.read(nil) + } + bodyArr := []interface{}{body} + return c.read(&bodyArr) +} + +func (c *msgpackSpecRpcCodec) parseCustomHeader(expectTypeByte byte, msgid *uint64, methodOrError *string) (err error) { + + if c.cls { + return io.EOF + } + + // We read the response header by hand + // so that the body can be decoded on its own from the stream at a later time. + + const fia byte = 0x94 //four item array descriptor value + // Not sure why the panic of EOF is swallowed above. + // if bs1 := c.dec.r.readn1(); bs1 != fia { + // err = fmt.Errorf("Unexpected value for array descriptor: Expecting %v. Received %v", fia, bs1) + // return + // } + var b byte + b, err = c.br.ReadByte() + if err != nil { + return + } + if b != fia { + err = fmt.Errorf("Unexpected value for array descriptor: Expecting %v. Received %v", fia, b) + return + } + + if err = c.read(&b); err != nil { + return + } + if b != expectTypeByte { + err = fmt.Errorf("Unexpected byte descriptor in header. Expecting %v. Received %v", expectTypeByte, b) + return + } + if err = c.read(msgid); err != nil { + return + } + if err = c.read(methodOrError); err != nil { + return + } + return +} + +//-------------------------------------------------- + +// msgpackSpecRpc is the implementation of Rpc that uses custom communication protocol +// as defined in the msgpack spec at https://github.com/msgpack-rpc/msgpack-rpc/blob/master/spec.md +type msgpackSpecRpc struct{} + +// MsgpackSpecRpc implements Rpc using the communication protocol defined in +// the msgpack spec at https://github.com/msgpack-rpc/msgpack-rpc/blob/master/spec.md . +// Its methods (ServerCodec and ClientCodec) return values that implement RpcCodecBuffered. +var MsgpackSpecRpc msgpackSpecRpc + +func (x msgpackSpecRpc) ServerCodec(conn io.ReadWriteCloser, h Handle) rpc.ServerCodec { + return &msgpackSpecRpcCodec{newRPCCodec(conn, h)} +} + +func (x msgpackSpecRpc) ClientCodec(conn io.ReadWriteCloser, h Handle) rpc.ClientCodec { + return &msgpackSpecRpcCodec{newRPCCodec(conn, h)} +} + +var _ decDriver = (*msgpackDecDriver)(nil) +var _ encDriver = (*msgpackEncDriver)(nil) diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/msgpack_test.py b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/msgpack_test.py new file mode 100644 index 00000000000..e933838c56a --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/msgpack_test.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +# This will create golden files in a directory passed to it. +# A Test calls this internally to create the golden files +# So it can process them (so we don't have to checkin the files). + +import msgpack, msgpackrpc, sys, os, threading + +def get_test_data_list(): + # get list with all primitive types, and a combo type + l0 = [ + -8, + -1616, + -32323232, + -6464646464646464, + 192, + 1616, + 32323232, + 6464646464646464, + 192, + -3232.0, + -6464646464.0, + 3232.0, + 6464646464.0, + False, + True, + None, + "someday", + "", + "bytestring", + 1328176922000002000, + -2206187877999998000, + 0, + -6795364578871345152 + ] + l1 = [ + { "true": True, + "false": False }, + { "true": "True", + "false": False, + "uint16(1616)": 1616 }, + { "list": [1616, 32323232, True, -3232.0, {"TRUE":True, "FALSE":False}, [True, False] ], + "int32":32323232, "bool": True, + "LONG STRING": "123456789012345678901234567890123456789012345678901234567890", + "SHORT STRING": "1234567890" }, + { True: "true", 8: False, "false": 0 } + ] + + l = [] + l.extend(l0) + l.append(l0) + l.extend(l1) + return l + +def build_test_data(destdir): + l = get_test_data_list() + for i in range(len(l)): + packer = msgpack.Packer() + serialized = packer.pack(l[i]) + f = open(os.path.join(destdir, str(i) + '.golden'), 'wb') + f.write(serialized) + f.close() + +def doRpcServer(port, stopTimeSec): + class EchoHandler(object): + def Echo123(self, msg1, msg2, msg3): + return ("1:%s 2:%s 3:%s" % (msg1, msg2, msg3)) + def EchoStruct(self, msg): + return ("%s" % msg) + + addr = msgpackrpc.Address('localhost', port) + server = msgpackrpc.Server(EchoHandler()) + server.listen(addr) + # run thread to stop it after stopTimeSec seconds if > 0 + if stopTimeSec > 0: + def myStopRpcServer(): + server.stop() + t = threading.Timer(stopTimeSec, myStopRpcServer) + t.start() + server.start() + +def doRpcClientToPythonSvc(port): + address = msgpackrpc.Address('localhost', port) + client = msgpackrpc.Client(address, unpack_encoding='utf-8') + print client.call("Echo123", "A1", "B2", "C3") + print client.call("EchoStruct", {"A" :"Aa", "B":"Bb", "C":"Cc"}) + +def doRpcClientToGoSvc(port): + # print ">>>> port: ", port, " <<<<<" + address = msgpackrpc.Address('localhost', port) + client = msgpackrpc.Client(address, unpack_encoding='utf-8') + print client.call("TestRpcInt.Echo123", ["A1", "B2", "C3"]) + print client.call("TestRpcInt.EchoStruct", {"A" :"Aa", "B":"Bb", "C":"Cc"}) + +def doMain(args): + if len(args) == 2 and args[0] == "testdata": + build_test_data(args[1]) + elif len(args) == 3 and args[0] == "rpc-server": + doRpcServer(int(args[1]), int(args[2])) + elif len(args) == 2 and args[0] == "rpc-client-python-service": + doRpcClientToPythonSvc(int(args[1])) + elif len(args) == 2 and args[0] == "rpc-client-go-service": + doRpcClientToGoSvc(int(args[1])) + else: + print("Usage: msgpack_test.py " + + "[testdata|rpc-server|rpc-client-python-service|rpc-client-go-service] ...") + +if __name__ == "__main__": + doMain(sys.argv[1:]) + diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/rpc.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/rpc.go new file mode 100644 index 00000000000..d014dbdcc7d --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/rpc.go @@ -0,0 +1,152 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +package codec + +import ( + "bufio" + "io" + "net/rpc" + "sync" +) + +// Rpc provides a rpc Server or Client Codec for rpc communication. +type Rpc interface { + ServerCodec(conn io.ReadWriteCloser, h Handle) rpc.ServerCodec + ClientCodec(conn io.ReadWriteCloser, h Handle) rpc.ClientCodec +} + +// RpcCodecBuffered allows access to the underlying bufio.Reader/Writer +// used by the rpc connection. It accomodates use-cases where the connection +// should be used by rpc and non-rpc functions, e.g. streaming a file after +// sending an rpc response. +type RpcCodecBuffered interface { + BufferedReader() *bufio.Reader + BufferedWriter() *bufio.Writer +} + +// ------------------------------------- + +// rpcCodec defines the struct members and common methods. +type rpcCodec struct { + rwc io.ReadWriteCloser + dec *Decoder + enc *Encoder + bw *bufio.Writer + br *bufio.Reader + mu sync.Mutex + cls bool +} + +func newRPCCodec(conn io.ReadWriteCloser, h Handle) rpcCodec { + bw := bufio.NewWriter(conn) + br := bufio.NewReader(conn) + return rpcCodec{ + rwc: conn, + bw: bw, + br: br, + enc: NewEncoder(bw, h), + dec: NewDecoder(br, h), + } +} + +func (c *rpcCodec) BufferedReader() *bufio.Reader { + return c.br +} + +func (c *rpcCodec) BufferedWriter() *bufio.Writer { + return c.bw +} + +func (c *rpcCodec) write(obj1, obj2 interface{}, writeObj2, doFlush bool) (err error) { + if c.cls { + return io.EOF + } + if err = c.enc.Encode(obj1); err != nil { + return + } + if writeObj2 { + if err = c.enc.Encode(obj2); err != nil { + return + } + } + if doFlush && c.bw != nil { + return c.bw.Flush() + } + return +} + +func (c *rpcCodec) read(obj interface{}) (err error) { + if c.cls { + return io.EOF + } + //If nil is passed in, we should still attempt to read content to nowhere. + if obj == nil { + var obj2 interface{} + return c.dec.Decode(&obj2) + } + return c.dec.Decode(obj) +} + +func (c *rpcCodec) Close() error { + if c.cls { + return io.EOF + } + c.cls = true + return c.rwc.Close() +} + +func (c *rpcCodec) ReadResponseBody(body interface{}) error { + return c.read(body) +} + +// ------------------------------------- + +type goRpcCodec struct { + rpcCodec +} + +func (c *goRpcCodec) WriteRequest(r *rpc.Request, body interface{}) error { + // Must protect for concurrent access as per API + c.mu.Lock() + defer c.mu.Unlock() + return c.write(r, body, true, true) +} + +func (c *goRpcCodec) WriteResponse(r *rpc.Response, body interface{}) error { + c.mu.Lock() + defer c.mu.Unlock() + return c.write(r, body, true, true) +} + +func (c *goRpcCodec) ReadResponseHeader(r *rpc.Response) error { + return c.read(r) +} + +func (c *goRpcCodec) ReadRequestHeader(r *rpc.Request) error { + return c.read(r) +} + +func (c *goRpcCodec) ReadRequestBody(body interface{}) error { + return c.read(body) +} + +// ------------------------------------- + +// goRpc is the implementation of Rpc that uses the communication protocol +// as defined in net/rpc package. +type goRpc struct{} + +// GoRpc implements Rpc using the communication protocol defined in net/rpc package. +// Its methods (ServerCodec and ClientCodec) return values that implement RpcCodecBuffered. +var GoRpc goRpc + +func (x goRpc) ServerCodec(conn io.ReadWriteCloser, h Handle) rpc.ServerCodec { + return &goRpcCodec{newRPCCodec(conn, h)} +} + +func (x goRpc) ClientCodec(conn io.ReadWriteCloser, h Handle) rpc.ClientCodec { + return &goRpcCodec{newRPCCodec(conn, h)} +} + +var _ RpcCodecBuffered = (*rpcCodec)(nil) // ensure *rpcCodec implements RpcCodecBuffered diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/simple.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/simple.go new file mode 100644 index 00000000000..9e4d148a2a1 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/simple.go @@ -0,0 +1,461 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +package codec + +import "math" + +const ( + _ uint8 = iota + simpleVdNil = 1 + simpleVdFalse = 2 + simpleVdTrue = 3 + simpleVdFloat32 = 4 + simpleVdFloat64 = 5 + + // each lasts for 4 (ie n, n+1, n+2, n+3) + simpleVdPosInt = 8 + simpleVdNegInt = 12 + + // containers: each lasts for 4 (ie n, n+1, n+2, ... n+7) + simpleVdString = 216 + simpleVdByteArray = 224 + simpleVdArray = 232 + simpleVdMap = 240 + simpleVdExt = 248 +) + +type simpleEncDriver struct { + h *SimpleHandle + w encWriter + //b [8]byte +} + +func (e *simpleEncDriver) isBuiltinType(rt uintptr) bool { + return false +} + +func (e *simpleEncDriver) encodeBuiltin(rt uintptr, v interface{}) { +} + +func (e *simpleEncDriver) encodeNil() { + e.w.writen1(simpleVdNil) +} + +func (e *simpleEncDriver) encodeBool(b bool) { + if b { + e.w.writen1(simpleVdTrue) + } else { + e.w.writen1(simpleVdFalse) + } +} + +func (e *simpleEncDriver) encodeFloat32(f float32) { + e.w.writen1(simpleVdFloat32) + e.w.writeUint32(math.Float32bits(f)) +} + +func (e *simpleEncDriver) encodeFloat64(f float64) { + e.w.writen1(simpleVdFloat64) + e.w.writeUint64(math.Float64bits(f)) +} + +func (e *simpleEncDriver) encodeInt(v int64) { + if v < 0 { + e.encUint(uint64(-v), simpleVdNegInt) + } else { + e.encUint(uint64(v), simpleVdPosInt) + } +} + +func (e *simpleEncDriver) encodeUint(v uint64) { + e.encUint(v, simpleVdPosInt) +} + +func (e *simpleEncDriver) encUint(v uint64, bd uint8) { + switch { + case v <= math.MaxUint8: + e.w.writen2(bd, uint8(v)) + case v <= math.MaxUint16: + e.w.writen1(bd + 1) + e.w.writeUint16(uint16(v)) + case v <= math.MaxUint32: + e.w.writen1(bd + 2) + e.w.writeUint32(uint32(v)) + case v <= math.MaxUint64: + e.w.writen1(bd + 3) + e.w.writeUint64(v) + } +} + +func (e *simpleEncDriver) encLen(bd byte, length int) { + switch { + case length == 0: + e.w.writen1(bd) + case length <= math.MaxUint8: + e.w.writen1(bd + 1) + e.w.writen1(uint8(length)) + case length <= math.MaxUint16: + e.w.writen1(bd + 2) + e.w.writeUint16(uint16(length)) + case int64(length) <= math.MaxUint32: + e.w.writen1(bd + 3) + e.w.writeUint32(uint32(length)) + default: + e.w.writen1(bd + 4) + e.w.writeUint64(uint64(length)) + } +} + +func (e *simpleEncDriver) encodeExtPreamble(xtag byte, length int) { + e.encLen(simpleVdExt, length) + e.w.writen1(xtag) +} + +func (e *simpleEncDriver) encodeArrayPreamble(length int) { + e.encLen(simpleVdArray, length) +} + +func (e *simpleEncDriver) encodeMapPreamble(length int) { + e.encLen(simpleVdMap, length) +} + +func (e *simpleEncDriver) encodeString(c charEncoding, v string) { + e.encLen(simpleVdString, len(v)) + e.w.writestr(v) +} + +func (e *simpleEncDriver) encodeSymbol(v string) { + e.encodeString(c_UTF8, v) +} + +func (e *simpleEncDriver) encodeStringBytes(c charEncoding, v []byte) { + e.encLen(simpleVdByteArray, len(v)) + e.w.writeb(v) +} + +//------------------------------------ + +type simpleDecDriver struct { + h *SimpleHandle + r decReader + bdRead bool + bdType valueType + bd byte + //b [8]byte +} + +func (d *simpleDecDriver) initReadNext() { + if d.bdRead { + return + } + d.bd = d.r.readn1() + d.bdRead = true + d.bdType = valueTypeUnset +} + +func (d *simpleDecDriver) currentEncodedType() valueType { + if d.bdType == valueTypeUnset { + switch d.bd { + case simpleVdNil: + d.bdType = valueTypeNil + case simpleVdTrue, simpleVdFalse: + d.bdType = valueTypeBool + case simpleVdPosInt, simpleVdPosInt + 1, simpleVdPosInt + 2, simpleVdPosInt + 3: + d.bdType = valueTypeUint + case simpleVdNegInt, simpleVdNegInt + 1, simpleVdNegInt + 2, simpleVdNegInt + 3: + d.bdType = valueTypeInt + case simpleVdFloat32, simpleVdFloat64: + d.bdType = valueTypeFloat + case simpleVdString, simpleVdString + 1, simpleVdString + 2, simpleVdString + 3, simpleVdString + 4: + d.bdType = valueTypeString + case simpleVdByteArray, simpleVdByteArray + 1, simpleVdByteArray + 2, simpleVdByteArray + 3, simpleVdByteArray + 4: + d.bdType = valueTypeBytes + case simpleVdExt, simpleVdExt + 1, simpleVdExt + 2, simpleVdExt + 3, simpleVdExt + 4: + d.bdType = valueTypeExt + case simpleVdArray, simpleVdArray + 1, simpleVdArray + 2, simpleVdArray + 3, simpleVdArray + 4: + d.bdType = valueTypeArray + case simpleVdMap, simpleVdMap + 1, simpleVdMap + 2, simpleVdMap + 3, simpleVdMap + 4: + d.bdType = valueTypeMap + default: + decErr("currentEncodedType: Unrecognized d.vd: 0x%x", d.bd) + } + } + return d.bdType +} + +func (d *simpleDecDriver) tryDecodeAsNil() bool { + if d.bd == simpleVdNil { + d.bdRead = false + return true + } + return false +} + +func (d *simpleDecDriver) isBuiltinType(rt uintptr) bool { + return false +} + +func (d *simpleDecDriver) decodeBuiltin(rt uintptr, v interface{}) { +} + +func (d *simpleDecDriver) decIntAny() (ui uint64, i int64, neg bool) { + switch d.bd { + case simpleVdPosInt: + ui = uint64(d.r.readn1()) + i = int64(ui) + case simpleVdPosInt + 1: + ui = uint64(d.r.readUint16()) + i = int64(ui) + case simpleVdPosInt + 2: + ui = uint64(d.r.readUint32()) + i = int64(ui) + case simpleVdPosInt + 3: + ui = uint64(d.r.readUint64()) + i = int64(ui) + case simpleVdNegInt: + ui = uint64(d.r.readn1()) + i = -(int64(ui)) + neg = true + case simpleVdNegInt + 1: + ui = uint64(d.r.readUint16()) + i = -(int64(ui)) + neg = true + case simpleVdNegInt + 2: + ui = uint64(d.r.readUint32()) + i = -(int64(ui)) + neg = true + case simpleVdNegInt + 3: + ui = uint64(d.r.readUint64()) + i = -(int64(ui)) + neg = true + default: + decErr("decIntAny: Integer only valid from pos/neg integer1..8. Invalid descriptor: %v", d.bd) + } + // don't do this check, because callers may only want the unsigned value. + // if ui > math.MaxInt64 { + // decErr("decIntAny: Integer out of range for signed int64: %v", ui) + // } + return +} + +func (d *simpleDecDriver) decodeInt(bitsize uint8) (i int64) { + _, i, _ = d.decIntAny() + checkOverflow(0, i, bitsize) + d.bdRead = false + return +} + +func (d *simpleDecDriver) decodeUint(bitsize uint8) (ui uint64) { + ui, i, neg := d.decIntAny() + if neg { + decErr("Assigning negative signed value: %v, to unsigned type", i) + } + checkOverflow(ui, 0, bitsize) + d.bdRead = false + return +} + +func (d *simpleDecDriver) decodeFloat(chkOverflow32 bool) (f float64) { + switch d.bd { + case simpleVdFloat32: + f = float64(math.Float32frombits(d.r.readUint32())) + case simpleVdFloat64: + f = math.Float64frombits(d.r.readUint64()) + default: + if d.bd >= simpleVdPosInt && d.bd <= simpleVdNegInt+3 { + _, i, _ := d.decIntAny() + f = float64(i) + } else { + decErr("Float only valid from float32/64: Invalid descriptor: %v", d.bd) + } + } + checkOverflowFloat32(f, chkOverflow32) + d.bdRead = false + return +} + +// bool can be decoded from bool only (single byte). +func (d *simpleDecDriver) decodeBool() (b bool) { + switch d.bd { + case simpleVdTrue: + b = true + case simpleVdFalse: + default: + decErr("Invalid single-byte value for bool: %s: %x", msgBadDesc, d.bd) + } + d.bdRead = false + return +} + +func (d *simpleDecDriver) readMapLen() (length int) { + d.bdRead = false + return d.decLen() +} + +func (d *simpleDecDriver) readArrayLen() (length int) { + d.bdRead = false + return d.decLen() +} + +func (d *simpleDecDriver) decLen() int { + switch d.bd % 8 { + case 0: + return 0 + case 1: + return int(d.r.readn1()) + case 2: + return int(d.r.readUint16()) + case 3: + ui := uint64(d.r.readUint32()) + checkOverflow(ui, 0, intBitsize) + return int(ui) + case 4: + ui := d.r.readUint64() + checkOverflow(ui, 0, intBitsize) + return int(ui) + } + decErr("decLen: Cannot read length: bd%8 must be in range 0..4. Got: %d", d.bd%8) + return -1 +} + +func (d *simpleDecDriver) decodeString() (s string) { + s = string(d.r.readn(d.decLen())) + d.bdRead = false + return +} + +func (d *simpleDecDriver) decodeBytes(bs []byte) (bsOut []byte, changed bool) { + if clen := d.decLen(); clen > 0 { + // if no contents in stream, don't update the passed byteslice + if len(bs) != clen { + if len(bs) > clen { + bs = bs[:clen] + } else { + bs = make([]byte, clen) + } + bsOut = bs + changed = true + } + d.r.readb(bs) + } + d.bdRead = false + return +} + +func (d *simpleDecDriver) decodeExt(verifyTag bool, tag byte) (xtag byte, xbs []byte) { + switch d.bd { + case simpleVdExt, simpleVdExt + 1, simpleVdExt + 2, simpleVdExt + 3, simpleVdExt + 4: + l := d.decLen() + xtag = d.r.readn1() + if verifyTag && xtag != tag { + decErr("Wrong extension tag. Got %b. Expecting: %v", xtag, tag) + } + xbs = d.r.readn(l) + case simpleVdByteArray, simpleVdByteArray + 1, simpleVdByteArray + 2, simpleVdByteArray + 3, simpleVdByteArray + 4: + xbs, _ = d.decodeBytes(nil) + default: + decErr("Invalid d.vd for extensions (Expecting extensions or byte array). Got: 0x%x", d.bd) + } + d.bdRead = false + return +} + +func (d *simpleDecDriver) decodeNaked() (v interface{}, vt valueType, decodeFurther bool) { + d.initReadNext() + + switch d.bd { + case simpleVdNil: + vt = valueTypeNil + case simpleVdFalse: + vt = valueTypeBool + v = false + case simpleVdTrue: + vt = valueTypeBool + v = true + case simpleVdPosInt, simpleVdPosInt + 1, simpleVdPosInt + 2, simpleVdPosInt + 3: + vt = valueTypeUint + ui, _, _ := d.decIntAny() + v = ui + case simpleVdNegInt, simpleVdNegInt + 1, simpleVdNegInt + 2, simpleVdNegInt + 3: + vt = valueTypeInt + _, i, _ := d.decIntAny() + v = i + case simpleVdFloat32: + vt = valueTypeFloat + v = d.decodeFloat(true) + case simpleVdFloat64: + vt = valueTypeFloat + v = d.decodeFloat(false) + case simpleVdString, simpleVdString + 1, simpleVdString + 2, simpleVdString + 3, simpleVdString + 4: + vt = valueTypeString + v = d.decodeString() + case simpleVdByteArray, simpleVdByteArray + 1, simpleVdByteArray + 2, simpleVdByteArray + 3, simpleVdByteArray + 4: + vt = valueTypeBytes + v, _ = d.decodeBytes(nil) + case simpleVdExt, simpleVdExt + 1, simpleVdExt + 2, simpleVdExt + 3, simpleVdExt + 4: + vt = valueTypeExt + l := d.decLen() + var re RawExt + re.Tag = d.r.readn1() + re.Data = d.r.readn(l) + v = &re + vt = valueTypeExt + case simpleVdArray, simpleVdArray + 1, simpleVdArray + 2, simpleVdArray + 3, simpleVdArray + 4: + vt = valueTypeArray + decodeFurther = true + case simpleVdMap, simpleVdMap + 1, simpleVdMap + 2, simpleVdMap + 3, simpleVdMap + 4: + vt = valueTypeMap + decodeFurther = true + default: + decErr("decodeNaked: Unrecognized d.vd: 0x%x", d.bd) + } + + if !decodeFurther { + d.bdRead = false + } + return +} + +//------------------------------------ + +// SimpleHandle is a Handle for a very simple encoding format. +// +// simple is a simplistic codec similar to binc, but not as compact. +// - Encoding of a value is always preceeded by the descriptor byte (bd) +// - True, false, nil are encoded fully in 1 byte (the descriptor) +// - Integers (intXXX, uintXXX) are encoded in 1, 2, 4 or 8 bytes (plus a descriptor byte). +// There are positive (uintXXX and intXXX >= 0) and negative (intXXX < 0) integers. +// - Floats are encoded in 4 or 8 bytes (plus a descriptor byte) +// - Lenght of containers (strings, bytes, array, map, extensions) +// are encoded in 0, 1, 2, 4 or 8 bytes. +// Zero-length containers have no length encoded. +// For others, the number of bytes is given by pow(2, bd%3) +// - maps are encoded as [bd] [length] [[key][value]]... +// - arrays are encoded as [bd] [length] [value]... +// - extensions are encoded as [bd] [length] [tag] [byte]... +// - strings/bytearrays are encoded as [bd] [length] [byte]... +// +// The full spec will be published soon. +type SimpleHandle struct { + BasicHandle +} + +func (h *SimpleHandle) newEncDriver(w encWriter) encDriver { + return &simpleEncDriver{w: w, h: h} +} + +func (h *SimpleHandle) newDecDriver(r decReader) decDriver { + return &simpleDecDriver{r: r, h: h} +} + +func (_ *SimpleHandle) writeExt() bool { + return true +} + +func (h *SimpleHandle) getBasicHandle() *BasicHandle { + return &h.BasicHandle +} + +var _ decDriver = (*simpleDecDriver)(nil) +var _ encDriver = (*simpleEncDriver)(nil) diff --git a/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/time.go b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/time.go new file mode 100644 index 00000000000..c86d65328d7 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/go-msgpack/codec/time.go @@ -0,0 +1,193 @@ +// Copyright (c) 2012, 2013 Ugorji Nwoke. All rights reserved. +// Use of this source code is governed by a BSD-style license found in the LICENSE file. + +package codec + +import ( + "time" +) + +var ( + timeDigits = [...]byte{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'} +) + +// EncodeTime encodes a time.Time as a []byte, including +// information on the instant in time and UTC offset. +// +// Format Description +// +// A timestamp is composed of 3 components: +// +// - secs: signed integer representing seconds since unix epoch +// - nsces: unsigned integer representing fractional seconds as a +// nanosecond offset within secs, in the range 0 <= nsecs < 1e9 +// - tz: signed integer representing timezone offset in minutes east of UTC, +// and a dst (daylight savings time) flag +// +// When encoding a timestamp, the first byte is the descriptor, which +// defines which components are encoded and how many bytes are used to +// encode secs and nsecs components. *If secs/nsecs is 0 or tz is UTC, it +// is not encoded in the byte array explicitly*. +// +// Descriptor 8 bits are of the form `A B C DDD EE`: +// A: Is secs component encoded? 1 = true +// B: Is nsecs component encoded? 1 = true +// C: Is tz component encoded? 1 = true +// DDD: Number of extra bytes for secs (range 0-7). +// If A = 1, secs encoded in DDD+1 bytes. +// If A = 0, secs is not encoded, and is assumed to be 0. +// If A = 1, then we need at least 1 byte to encode secs. +// DDD says the number of extra bytes beyond that 1. +// E.g. if DDD=0, then secs is represented in 1 byte. +// if DDD=2, then secs is represented in 3 bytes. +// EE: Number of extra bytes for nsecs (range 0-3). +// If B = 1, nsecs encoded in EE+1 bytes (similar to secs/DDD above) +// +// Following the descriptor bytes, subsequent bytes are: +// +// secs component encoded in `DDD + 1` bytes (if A == 1) +// nsecs component encoded in `EE + 1` bytes (if B == 1) +// tz component encoded in 2 bytes (if C == 1) +// +// secs and nsecs components are integers encoded in a BigEndian +// 2-complement encoding format. +// +// tz component is encoded as 2 bytes (16 bits). Most significant bit 15 to +// Least significant bit 0 are described below: +// +// Timezone offset has a range of -12:00 to +14:00 (ie -720 to +840 minutes). +// Bit 15 = have\_dst: set to 1 if we set the dst flag. +// Bit 14 = dst\_on: set to 1 if dst is in effect at the time, or 0 if not. +// Bits 13..0 = timezone offset in minutes. It is a signed integer in Big Endian format. +// +func encodeTime(t time.Time) []byte { + //t := rv.Interface().(time.Time) + tsecs, tnsecs := t.Unix(), t.Nanosecond() + var ( + bd byte + btmp [8]byte + bs [16]byte + i int = 1 + ) + l := t.Location() + if l == time.UTC { + l = nil + } + if tsecs != 0 { + bd = bd | 0x80 + bigen.PutUint64(btmp[:], uint64(tsecs)) + f := pruneSignExt(btmp[:], tsecs >= 0) + bd = bd | (byte(7-f) << 2) + copy(bs[i:], btmp[f:]) + i = i + (8 - f) + } + if tnsecs != 0 { + bd = bd | 0x40 + bigen.PutUint32(btmp[:4], uint32(tnsecs)) + f := pruneSignExt(btmp[:4], true) + bd = bd | byte(3-f) + copy(bs[i:], btmp[f:4]) + i = i + (4 - f) + } + if l != nil { + bd = bd | 0x20 + // Note that Go Libs do not give access to dst flag. + _, zoneOffset := t.Zone() + //zoneName, zoneOffset := t.Zone() + zoneOffset /= 60 + z := uint16(zoneOffset) + bigen.PutUint16(btmp[:2], z) + // clear dst flags + bs[i] = btmp[0] & 0x3f + bs[i+1] = btmp[1] + i = i + 2 + } + bs[0] = bd + return bs[0:i] +} + +// DecodeTime decodes a []byte into a time.Time. +func decodeTime(bs []byte) (tt time.Time, err error) { + bd := bs[0] + var ( + tsec int64 + tnsec uint32 + tz uint16 + i byte = 1 + i2 byte + n byte + ) + if bd&(1<<7) != 0 { + var btmp [8]byte + n = ((bd >> 2) & 0x7) + 1 + i2 = i + n + copy(btmp[8-n:], bs[i:i2]) + //if first bit of bs[i] is set, then fill btmp[0..8-n] with 0xff (ie sign extend it) + if bs[i]&(1<<7) != 0 { + copy(btmp[0:8-n], bsAll0xff) + //for j,k := byte(0), 8-n; j < k; j++ { btmp[j] = 0xff } + } + i = i2 + tsec = int64(bigen.Uint64(btmp[:])) + } + if bd&(1<<6) != 0 { + var btmp [4]byte + n = (bd & 0x3) + 1 + i2 = i + n + copy(btmp[4-n:], bs[i:i2]) + i = i2 + tnsec = bigen.Uint32(btmp[:]) + } + if bd&(1<<5) == 0 { + tt = time.Unix(tsec, int64(tnsec)).UTC() + return + } + // In stdlib time.Parse, when a date is parsed without a zone name, it uses "" as zone name. + // However, we need name here, so it can be shown when time is printed. + // Zone name is in form: UTC-08:00. + // Note that Go Libs do not give access to dst flag, so we ignore dst bits + + i2 = i + 2 + tz = bigen.Uint16(bs[i:i2]) + i = i2 + // sign extend sign bit into top 2 MSB (which were dst bits): + if tz&(1<<13) == 0 { // positive + tz = tz & 0x3fff //clear 2 MSBs: dst bits + } else { // negative + tz = tz | 0xc000 //set 2 MSBs: dst bits + //tzname[3] = '-' (TODO: verify. this works here) + } + tzint := int16(tz) + if tzint == 0 { + tt = time.Unix(tsec, int64(tnsec)).UTC() + } else { + // For Go Time, do not use a descriptive timezone. + // It's unnecessary, and makes it harder to do a reflect.DeepEqual. + // The Offset already tells what the offset should be, if not on UTC and unknown zone name. + // var zoneName = timeLocUTCName(tzint) + tt = time.Unix(tsec, int64(tnsec)).In(time.FixedZone("", int(tzint)*60)) + } + return +} + +func timeLocUTCName(tzint int16) string { + if tzint == 0 { + return "UTC" + } + var tzname = []byte("UTC+00:00") + //tzname := fmt.Sprintf("UTC%s%02d:%02d", tzsign, tz/60, tz%60) //perf issue using Sprintf. inline below. + //tzhr, tzmin := tz/60, tz%60 //faster if u convert to int first + var tzhr, tzmin int16 + if tzint < 0 { + tzname[3] = '-' // (TODO: verify. this works here) + tzhr, tzmin = -tzint/60, (-tzint)%60 + } else { + tzhr, tzmin = tzint/60, tzint%60 + } + tzname[4] = timeDigits[tzhr/10] + tzname[5] = timeDigits[tzhr%10] + tzname[7] = timeDigits[tzmin/10] + tzname[8] = timeDigits[tzmin%10] + return string(tzname) + //return time.FixedZone(string(tzname), int(tzint)*60) +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/LICENSE b/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/LICENSE new file mode 100644 index 00000000000..f0e5c79e181 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/LICENSE @@ -0,0 +1,362 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. "Contributor" + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. "Contributor Version" + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the terms of + a Secondary License. + +1.6. "Executable Form" + + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + + means a work that combines Covered Software with other material, in a + separate file or files, that is not Covered Software. + +1.8. "License" + + means this document. + +1.9. "Licensable" + + means having the right to grant, to the maximum extent possible, whether + at the time of the initial grant or subsequently, any and all of the + rights conveyed by this License. + +1.10. "Modifications" + + means any of the following: + + a. any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. "Patent Claims" of a Contributor + + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the License, + by the making, using, selling, offering for sale, having made, import, + or transfer of either its Contributions or its Contributor Version. + +1.12. "Secondary License" + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. "Source Code Form" + + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, "control" means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution + become effective for each Contribution on the date the Contributor first + distributes such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under + this License. No additional rights or licenses will be implied from the + distribution or licensing of Covered Software under this License. + Notwithstanding Section 2.1(b) above, no patent license is granted by a + Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of + its Contributions. + + This License does not grant any rights in the trademarks, service marks, + or logos of any Contributor (except as may be necessary to comply with + the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this + License (see Section 10.2) or under the terms of a Secondary License (if + permitted under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its + Contributions are its original creation(s) or it has sufficient rights to + grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under + applicable copyright doctrines of fair use, fair dealing, or other + equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under + the terms of this License. You must inform recipients that the Source + Code Form of the Covered Software is governed by the terms of this + License, and how they can obtain a copy of this License. You may not + attempt to alter or restrict the recipients' rights in the Source Code + Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter the + recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for + the Covered Software. If the Larger Work is a combination of Covered + Software with a work governed by one or more Secondary Licenses, and the + Covered Software is not Incompatible With Secondary Licenses, this + License permits You to additionally distribute such Covered Software + under the terms of such Secondary License(s), so that the recipient of + the Larger Work may, at their option, further distribute the Covered + Software under the terms of either this License or such Secondary + License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices + (including copyright notices, patent notices, disclaimers of warranty, or + limitations of liability) contained within the Source Code Form of the + Covered Software, except that You may alter any license notices to the + extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on + behalf of any Contributor. You must make it absolutely clear that any + such warranty, support, indemnity, or liability obligation is offered by + You alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, + judicial order, or regulation then You must: (a) comply with the terms of + this License to the maximum extent possible; and (b) describe the + limitations and the code they affect. Such description must be placed in a + text file included with all distributions of the Covered Software under + this License. Except to the extent prohibited by statute or regulation, + such description must be sufficiently detailed for a recipient of ordinary + skill to be able to understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing + basis, if such Contributor fails to notify You of the non-compliance by + some reasonable means prior to 60 days after You have come back into + compliance. Moreover, Your grants from a particular Contributor are + reinstated on an ongoing basis if such Contributor notifies You of the + non-compliance by some reasonable means, this is the first time You have + received notice of non-compliance with this License from such + Contributor, and You become compliant prior to 30 days after Your receipt + of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, + counter-claims, and cross-claims) alleging that a Contributor Version + directly or indirectly infringes any patent, then the rights granted to + You by any and all Contributors for the Covered Software under Section + 2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an "as is" basis, + without warranty of any kind, either expressed, implied, or statutory, + including, without limitation, warranties that the Covered Software is free + of defects, merchantable, fit for a particular purpose or non-infringing. + The entire risk as to the quality and performance of the Covered Software + is with You. Should any Covered Software prove defective in any respect, + You (not any Contributor) assume the cost of any necessary servicing, + repair, or correction. This disclaimer of warranty constitutes an essential + part of this License. No use of any Covered Software is authorized under + this License except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from + such party's negligence to the extent applicable law prohibits such + limitation. Some jurisdictions do not allow the exclusion or limitation of + incidental or consequential damages, so this exclusion and limitation may + not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts + of a jurisdiction where the defendant maintains its principal place of + business and such litigation shall be governed by laws of that + jurisdiction, without reference to its conflict-of-law provisions. Nothing + in this Section shall prevent a party's ability to bring cross-claims or + counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. Any law or regulation which provides that + the language of a contract shall be construed against the drafter shall not + be used to construe this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version + of the License under which You originally received the Covered Software, + or under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a + modified version of this License if you rename the license and remove + any references to the name of the license steward (except to note that + such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary + Licenses If You choose to distribute Source Code Form that is + Incompatible With Secondary Licenses under the terms of this version of + the License, the notice described in Exhibit B of this License must be + attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, +then You may include the notice in a location (such as a LICENSE file in a +relevant directory) where a recipient would be likely to look for such a +notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice + + This Source Code Form is "Incompatible + With Secondary Licenses", as defined by + the Mozilla Public License, v. 2.0. \ No newline at end of file diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/README.md b/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/README.md new file mode 100644 index 00000000000..5d7180ab9ec --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/README.md @@ -0,0 +1,11 @@ +raft-boltdb +=========== + +This repository provides the `raftboltdb` package. The package exports the +`BoltStore` which is an implementation of both a `LogStore` and `StableStore`. + +It is meant to be used as a backend for the `raft` [package +here](https://github.com/hashicorp/raft). + +This implementation uses [BoltDB](https://github.com/boltdb/bolt). BoltDB is +a simple key/value store implemented in pure Go, and inspired by LMDB. diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/bolt_store.go b/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/bolt_store.go new file mode 100644 index 00000000000..ab6dd4803e6 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/bolt_store.go @@ -0,0 +1,231 @@ +package raftboltdb + +import ( + "errors" + + "github.com/boltdb/bolt" + "github.com/hashicorp/raft" +) + +const ( + // Permissions to use on the db file. This is only used if the + // database file does not exist and needs to be created. + dbFileMode = 0600 +) + +var ( + // Bucket names we perform transactions in + dbLogs = []byte("logs") + dbConf = []byte("conf") + + // An error indicating a given key does not exist + ErrKeyNotFound = errors.New("not found") +) + +// BoltStore provides access to BoltDB for Raft to store and retrieve +// log entries. It also provides key/value storage, and can be used as +// a LogStore and StableStore. +type BoltStore struct { + // conn is the underlying handle to the db. + conn *bolt.DB + + // The path to the Bolt database file + path string +} + +// NewBoltStore takes a file path and returns a connected Raft backend. +func NewBoltStore(path string) (*BoltStore, error) { + // Try to connect + handle, err := bolt.Open(path, dbFileMode, nil) + if err != nil { + return nil, err + } + + // Create the new store + store := &BoltStore{ + conn: handle, + path: path, + } + + // Set up our buckets + if err := store.initialize(); err != nil { + store.Close() + return nil, err + } + + return store, nil +} + +// initialize is used to set up all of the buckets. +func (b *BoltStore) initialize() error { + tx, err := b.conn.Begin(true) + if err != nil { + return err + } + defer tx.Rollback() + + // Create all the buckets + if _, err := tx.CreateBucketIfNotExists(dbLogs); err != nil { + return err + } + if _, err := tx.CreateBucketIfNotExists(dbConf); err != nil { + return err + } + + return tx.Commit() +} + +// Close is used to gracefully close the DB connection. +func (b *BoltStore) Close() error { + return b.conn.Close() +} + +// FirstIndex returns the first known index from the Raft log. +func (b *BoltStore) FirstIndex() (uint64, error) { + tx, err := b.conn.Begin(false) + if err != nil { + return 0, err + } + defer tx.Rollback() + + curs := tx.Bucket(dbLogs).Cursor() + if first, _ := curs.First(); first == nil { + return 0, nil + } else { + return bytesToUint64(first), nil + } +} + +// LastIndex returns the last known index from the Raft log. +func (b *BoltStore) LastIndex() (uint64, error) { + tx, err := b.conn.Begin(false) + if err != nil { + return 0, err + } + defer tx.Rollback() + + curs := tx.Bucket(dbLogs).Cursor() + if last, _ := curs.Last(); last == nil { + return 0, nil + } else { + return bytesToUint64(last), nil + } +} + +// GetLog is used to retrieve a log from BoltDB at a given index. +func (b *BoltStore) GetLog(idx uint64, log *raft.Log) error { + tx, err := b.conn.Begin(false) + if err != nil { + return err + } + defer tx.Rollback() + + bucket := tx.Bucket(dbLogs) + val := bucket.Get(uint64ToBytes(idx)) + + if val == nil { + return raft.ErrLogNotFound + } + return decodeMsgPack(val, log) +} + +// StoreLog is used to store a single raft log +func (b *BoltStore) StoreLog(log *raft.Log) error { + return b.StoreLogs([]*raft.Log{log}) +} + +// StoreLogs is used to store a set of raft logs +func (b *BoltStore) StoreLogs(logs []*raft.Log) error { + tx, err := b.conn.Begin(true) + if err != nil { + return err + } + defer tx.Rollback() + + for _, log := range logs { + key := uint64ToBytes(log.Index) + val, err := encodeMsgPack(log) + if err != nil { + return err + } + bucket := tx.Bucket(dbLogs) + if err := bucket.Put(key, val.Bytes()); err != nil { + return err + } + } + + return tx.Commit() +} + +// DeleteRange is used to delete logs within a given range inclusively. +func (b *BoltStore) DeleteRange(min, max uint64) error { + minKey := uint64ToBytes(min) + + tx, err := b.conn.Begin(true) + if err != nil { + return err + } + defer tx.Rollback() + + curs := tx.Bucket(dbLogs).Cursor() + for k, _ := curs.Seek(minKey); k != nil; k, _ = curs.Next() { + // Handle out-of-range log index + if bytesToUint64(k) > max { + break + } + + // Delete in-range log index + if err := curs.Delete(); err != nil { + return err + } + } + + return tx.Commit() +} + +// Set is used to set a key/value set outside of the raft log +func (b *BoltStore) Set(k, v []byte) error { + tx, err := b.conn.Begin(true) + if err != nil { + return err + } + defer tx.Rollback() + + bucket := tx.Bucket(dbConf) + if err := bucket.Put(k, v); err != nil { + return err + } + + return tx.Commit() +} + +// Get is used to retrieve a value from the k/v store by key +func (b *BoltStore) Get(k []byte) ([]byte, error) { + tx, err := b.conn.Begin(false) + if err != nil { + return nil, err + } + defer tx.Rollback() + + bucket := tx.Bucket(dbConf) + val := bucket.Get(k) + + if val == nil { + return nil, ErrKeyNotFound + } + return append([]byte{}, val...), nil +} + +// SetUint64 is like Set, but handles uint64 values +func (b *BoltStore) SetUint64(key []byte, val uint64) error { + return b.Set(key, uint64ToBytes(val)) +} + +// GetUint64 is like Get, but handles uint64 values +func (b *BoltStore) GetUint64(key []byte) (uint64, error) { + val, err := b.Get(key) + if err != nil { + return 0, err + } + return bytesToUint64(val), nil +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/util.go b/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/util.go new file mode 100644 index 00000000000..68dd786b7ad --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft-boltdb/util.go @@ -0,0 +1,37 @@ +package raftboltdb + +import ( + "bytes" + "encoding/binary" + + "github.com/hashicorp/go-msgpack/codec" +) + +// Decode reverses the encode operation on a byte slice input +func decodeMsgPack(buf []byte, out interface{}) error { + r := bytes.NewBuffer(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// Encode writes an encoded object to a new bytes buffer +func encodeMsgPack(in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// Converts bytes to an integer +func bytesToUint64(b []byte) uint64 { + return binary.BigEndian.Uint64(b) +} + +// Converts a uint to a byte slice +func uint64ToBytes(u uint64) []byte { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, u) + return buf +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/.gitignore b/Godeps/_workspace/src/github.com/hashicorp/raft/.gitignore new file mode 100644 index 00000000000..836562412fe --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/.gitignore @@ -0,0 +1,23 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/.travis.yml b/Godeps/_workspace/src/github.com/hashicorp/raft/.travis.yml new file mode 100644 index 00000000000..5cf041d263a --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/.travis.yml @@ -0,0 +1,14 @@ +language: go + +go: + - 1.2 + - tip + +install: make deps +script: + - make integ + +notifications: + flowdock: + secure: fZrcf9rlh2IrQrlch1sHkn3YI7SKvjGnAl/zyV5D6NROe1Bbr6d3QRMuCXWWdhJHzjKmXk5rIzbqJhUc0PNF7YjxGNKSzqWMQ56KcvN1k8DzlqxpqkcA3Jbs6fXCWo2fssRtZ7hj/wOP1f5n6cc7kzHDt9dgaYJ6nO2fqNPJiTc= + diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/LICENSE b/Godeps/_workspace/src/github.com/hashicorp/raft/LICENSE new file mode 100644 index 00000000000..c33dcc7c928 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/LICENSE @@ -0,0 +1,354 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. “Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. “Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. + diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/Makefile b/Godeps/_workspace/src/github.com/hashicorp/raft/Makefile new file mode 100644 index 00000000000..c61b34a8f6c --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/Makefile @@ -0,0 +1,17 @@ +DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...) + +test: + go test -timeout=5s ./... + +integ: test + INTEG_TESTS=yes go test -timeout=3s -run=Integ ./... + +deps: + go get -d -v ./... + echo $(DEPS) | xargs -n1 go get -d + +cov: + INTEG_TESTS=yes gocov test github.com/hashicorp/raft | gocov-html > /tmp/coverage.html + open /tmp/coverage.html + +.PHONY: test cov integ deps diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/README.md b/Godeps/_workspace/src/github.com/hashicorp/raft/README.md new file mode 100644 index 00000000000..ecb6c977eea --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/README.md @@ -0,0 +1,89 @@ +raft [![Build Status](https://travis-ci.org/hashicorp/raft.png)](https://travis-ci.org/hashicorp/raft) +==== + +raft is a [Go](http://www.golang.org) library that manages a replicated +log and can be used with an FSM to manage replicated state machines. It +is library for providing [consensus](http://en.wikipedia.org/wiki/Consensus_(computer_science)). + +The use cases for such a library are far-reaching as replicated state +machines are a key component of many distributed systems. They enable +building Consistent, Partition Tolerant (CP) systems, with limited +fault tolerance as well. + +## Building + +If you wish to build raft you'll need Go version 1.2+ installed. + +Please check your installation with: + +``` +go version +``` + +## Documentation + +For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/raft). + +To prevent complications with cgo, the primary backend `MDBStore` is in a separate repositoy, +called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation +for the `LogStore` and `StableStore`. + +A pure Go backend using [BoltDB](https://github.com/boltdb/bolt) is also available called +[raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore` +and `StableStore`. + +## Protocol + +raft is based on ["Raft: In Search of an Understandable Consensus Algorithm"](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) + +A high level overview of the Raft protocol is described below, but for details please read the full +[Raft paper](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) +followed by the raft source. Any questions about the raft protocol should be sent to the +[raft-dev mailing list](https://groups.google.com/forum/#!forum/raft-dev). + +### Protocol Description + +Raft nodes are always in one of three states: follower, candidate or leader. All +nodes initially start out as a follower. In this state, nodes can accept log entries +from a leader and cast votes. If no entries are received for some time, nodes +self-promote to the candidate state. In the candidate state nodes request votes from +their peers. If a candidate receives a quorum of votes, then it is promoted to a leader. +The leader must accept new log entries and replicate to all the other followers. +In addition, if stale reads are not acceptable, all queries must also be performed on +the leader. + +Once a cluster has a leader, it is able to accept new log entries. A client can +request that a leader append a new log entry, which is an opaque binary blob to +Raft. The leader then writes the entry to durable storage and attempts to replicate +to a quorum of followers. Once the log entry is considered *committed*, it can be +*applied* to a finite state machine. The finite state machine is application specific, +and is implemented using an interface. + +An obvious question relates to the unbounded nature of a replicated log. Raft provides +a mechanism by which the current state is snapshotted, and the log is compacted. Because +of the FSM abstraction, restoring the state of the FSM must result in the same state +as a replay of old logs. This allows Raft to capture the FSM state at a point in time, +and then remove all the logs that were used to reach that state. This is performed automatically +without user intervention, and prevents unbounded disk usage as well as minimizing +time spent replaying logs. + +Lastly, there is the issue of updating the peer set when new servers are joining +or existing servers are leaving. As long as a quorum of nodes is available, this +is not an issue as Raft provides mechanisms to dynamically update the peer set. +If a quorum of nodes is unavailable, then this becomes a very challenging issue. +For example, suppose there are only 2 peers, A and B. The quorum size is also +2, meaning both nodes must agree to commit a log entry. If either A or B fails, +it is now impossible to reach quorum. This means the cluster is unable to add, +or remove a node, or commit any additional log entries. This results in *unavailability*. +At this point, manual intervention would be required to remove either A or B, +and to restart the remaining node in bootstrap mode. + +A Raft cluster of 3 nodes can tolerate a single node failure, while a cluster +of 5 can tolerate 2 node failures. The recommended configuration is to either +run 3 or 5 raft servers. This maximizes availability without +greatly sacrificing performance. + +In terms of performance, Raft is comparable to Paxos. Assuming stable leadership, +committing a log entry requires a single round trip to half of the cluster. +Thus performance is bound by disk I/O and network latency. + diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/bench/bench.go b/Godeps/_workspace/src/github.com/hashicorp/raft/bench/bench.go new file mode 100644 index 00000000000..d7a58f45f44 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/bench/bench.go @@ -0,0 +1,171 @@ +package raftbench + +// raftbench provides common benchmarking functions which can be used by +// anything which implements the raft.LogStore and raft.StableStore interfaces. +// All functions accept these interfaces and perform benchmarking. This +// makes comparing backend performance easier by sharing the tests. + +import ( + "github.com/hashicorp/raft" + "testing" +) + +func FirstIndex(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run FirstIndex a number of times + for n := 0; n < b.N; n++ { + store.FirstIndex() + } +} + +func LastIndex(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run LastIndex a number of times + for n := 0; n < b.N; n++ { + store.LastIndex() + } +} + +func GetLog(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run GetLog a number of times + for n := 0; n < b.N; n++ { + if err := store.GetLog(5, new(raft.Log)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func StoreLog(b *testing.B, store raft.LogStore) { + // Run StoreLog a number of times + for n := 0; n < b.N; n++ { + log := &raft.Log{Index: uint64(n), Data: []byte("data")} + if err := store.StoreLog(log); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func StoreLogs(b *testing.B, store raft.LogStore) { + // Run StoreLogs a number of times. We want to set multiple logs each + // run, so we create 3 logs with incrementing indexes for each iteration. + for n := 0; n < b.N; n++ { + b.StopTimer() + offset := 3 * (n + 1) + logs := []*raft.Log{ + &raft.Log{Index: uint64(offset - 2), Data: []byte("data")}, + &raft.Log{Index: uint64(offset - 1), Data: []byte("data")}, + &raft.Log{Index: uint64(offset), Data: []byte("data")}, + } + b.StartTimer() + + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func DeleteRange(b *testing.B, store raft.LogStore) { + // Create some fake data. In this case, we create 3 new log entries for each + // test case, and separate them by index in multiples of 10. This allows + // some room so that we can test deleting ranges with "extra" logs to + // to ensure we stop going to the database once our max index is hit. + var logs []*raft.Log + for n := 0; n < b.N; n++ { + offset := 10 * n + for i := offset; i < offset+3; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Delete a range of the data + for n := 0; n < b.N; n++ { + offset := 10 * n + if err := store.DeleteRange(uint64(offset), uint64(offset+9)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func Set(b *testing.B, store raft.StableStore) { + // Run Set a number of times + for n := 0; n < b.N; n++ { + if err := store.Set([]byte{byte(n)}, []byte("val")); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func Get(b *testing.B, store raft.StableStore) { + // Create some fake data + for i := 1; i < 10; i++ { + if err := store.Set([]byte{byte(i)}, []byte("val")); err != nil { + b.Fatalf("err: %s", err) + } + } + b.ResetTimer() + + // Run Get a number of times + for n := 0; n < b.N; n++ { + if _, err := store.Get([]byte{0x05}); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func SetUint64(b *testing.B, store raft.StableStore) { + // Run SetUint64 a number of times + for n := 0; n < b.N; n++ { + if err := store.SetUint64([]byte{byte(n)}, uint64(n)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func GetUint64(b *testing.B, store raft.StableStore) { + // Create some fake data + for i := 0; i < 10; i++ { + if err := store.SetUint64([]byte{byte(i)}, uint64(i)); err != nil { + b.Fatalf("err: %s", err) + } + } + b.ResetTimer() + + // Run GetUint64 a number of times + for n := 0; n < b.N; n++ { + if _, err := store.Get([]byte{0x05}); err != nil { + b.Fatalf("err: %s", err) + } + } +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/commands.go b/Godeps/_workspace/src/github.com/hashicorp/raft/commands.go new file mode 100644 index 00000000000..739775b3541 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/commands.go @@ -0,0 +1,84 @@ +package raft + +// AppendEntriesRequest is the command used to append entries to the +// replicated log. +type AppendEntriesRequest struct { + // Provide the current term and leader + Term uint64 + Leader []byte + + // Provide the previous entries for integrity checking + PrevLogEntry uint64 + PrevLogTerm uint64 + + // New entries to commit + Entries []*Log + + // Commit index on the leader + LeaderCommitIndex uint64 +} + +// AppendEntriesResponse is the response returned from an +// AppendEntriesRequest. +type AppendEntriesResponse struct { + // Newer term if leader is out of date + Term uint64 + + // Last Log is a hint to help accelerate rebuilding slow nodes + LastLog uint64 + + // We may not succeed if we have a conflicting entry + Success bool + + // There are scenarios where this request didn't succeed + // but there's no need to wait/back-off the next attempt. + NoRetryBackoff bool +} + +// RequestVoteRequest is the command used by a candidate to ask a Raft peer +// for a vote in an election. +type RequestVoteRequest struct { + // Provide the term and our id + Term uint64 + Candidate []byte + + // Used to ensure safety + LastLogIndex uint64 + LastLogTerm uint64 +} + +// RequestVoteResponse is the response returned from a RequestVoteRequest. +type RequestVoteResponse struct { + // Newer term if leader is out of date + Term uint64 + + // Return the peers, so that a node can shutdown on removal + Peers []byte + + // Is the vote granted + Granted bool +} + +// InstallSnapshotRequest is the command sent to a Raft peer to bootstrap its +// log (and state machine) from a snapshot on another peer. +type InstallSnapshotRequest struct { + Term uint64 + Leader []byte + + // These are the last index/term included in the snapshot + LastLogIndex uint64 + LastLogTerm uint64 + + // Peer Set in the snapshot + Peers []byte + + // Size of the snapshot + Size int64 +} + +// InstallSnapshotResponse is the response returned from an +// InstallSnapshotRequest. +type InstallSnapshotResponse struct { + Term uint64 + Success bool +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/config.go b/Godeps/_workspace/src/github.com/hashicorp/raft/config.go new file mode 100644 index 00000000000..6b3c0b59f0c --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/config.go @@ -0,0 +1,134 @@ +package raft + +import ( + "fmt" + "io" + "log" + "time" +) + +// Config provides any necessary configuration to +// the Raft server +type Config struct { + // Time in follower state without a leader before we attempt an election. + HeartbeatTimeout time.Duration + + // Time in candidate state without a leader before we attempt an election. + ElectionTimeout time.Duration + + // Time without an Apply() operation before we heartbeat to ensure + // a timely commit. Due to random staggering, may be delayed as much as + // 2x this value. + CommitTimeout time.Duration + + // MaxAppendEntries controls the maximum number of append entries + // to send at once. We want to strike a balance between efficiency + // and avoiding waste if the follower is going to reject because of + // an inconsistent log. + MaxAppendEntries int + + // If we are a member of a cluster, and RemovePeer is invoked for the + // local node, then we forget all peers and transition into the follower state. + // If ShutdownOnRemove is is set, we additional shutdown Raft. Otherwise, + // we can become a leader of a cluster containing only this node. + ShutdownOnRemove bool + + // DisableBootstrapAfterElect is used to turn off EnableSingleNode + // after the node is elected. This is used to prevent self-election + // if the node is removed from the Raft cluster via RemovePeer. Setting + // it to false will keep the bootstrap mode, allowing the node to self-elect + // and potentially bootstrap a separate cluster. + DisableBootstrapAfterElect bool + + // TrailingLogs controls how many logs we leave after a snapshot. This is + // used so that we can quickly replay logs on a follower instead of being + // forced to send an entire snapshot. + TrailingLogs uint64 + + // SnapshotInterval controls how often we check if we should perform a snapshot. + // We randomly stagger between this value and 2x this value to avoid the entire + // cluster from performing a snapshot at once. + SnapshotInterval time.Duration + + // SnapshotThreshold controls how many outstanding logs there must be before + // we perform a snapshot. This is to prevent excessive snapshots when we can + // just replay a small set of logs. + SnapshotThreshold uint64 + + // EnableSingleNode allows for a single node mode of operation. This + // is false by default, which prevents a lone node from electing itself. + // leader. + EnableSingleNode bool + + // LeaderLeaseTimeout is used to control how long the "lease" lasts + // for being the leader without being able to contact a quorum + // of nodes. If we reach this interval without contact, we will + // step down as leader. + LeaderLeaseTimeout time.Duration + + // StartAsLeader forces Raft to start in the leader state. This should + // never be used except for testing purposes, as it can cause a split-brain. + StartAsLeader bool + + // NotifyCh is used to provide a channel that will be notified of leadership + // changes. Raft will block writing to this channel, so it should either be + // buffered or aggressively consumed. + NotifyCh chan<- bool + + // LogOutput is used as a sink for logs, unless Logger is specified. + // Defaults to os.Stderr. + LogOutput io.Writer + + // Logger is a user-provided logger. If nil, a logger writing to LogOutput + // is used. + Logger *log.Logger +} + +// DefaultConfig returns a Config with usable defaults. +func DefaultConfig() *Config { + return &Config{ + HeartbeatTimeout: 1000 * time.Millisecond, + ElectionTimeout: 1000 * time.Millisecond, + CommitTimeout: 50 * time.Millisecond, + MaxAppendEntries: 64, + ShutdownOnRemove: true, + DisableBootstrapAfterElect: true, + TrailingLogs: 10240, + SnapshotInterval: 120 * time.Second, + SnapshotThreshold: 8192, + EnableSingleNode: false, + LeaderLeaseTimeout: 500 * time.Millisecond, + } +} + +// ValidateConfig is used to validate a sane configuration +func ValidateConfig(config *Config) error { + if config.HeartbeatTimeout < 5*time.Millisecond { + return fmt.Errorf("Heartbeat timeout is too low") + } + if config.ElectionTimeout < 5*time.Millisecond { + return fmt.Errorf("Election timeout is too low") + } + if config.CommitTimeout < time.Millisecond { + return fmt.Errorf("Commit timeout is too low") + } + if config.MaxAppendEntries <= 0 { + return fmt.Errorf("MaxAppendEntries must be positive") + } + if config.MaxAppendEntries > 1024 { + return fmt.Errorf("MaxAppendEntries is too large") + } + if config.SnapshotInterval < 5*time.Millisecond { + return fmt.Errorf("Snapshot interval is too low") + } + if config.LeaderLeaseTimeout < 5*time.Millisecond { + return fmt.Errorf("Leader lease timeout is too low") + } + if config.LeaderLeaseTimeout > config.HeartbeatTimeout { + return fmt.Errorf("Leader lease timeout cannot be larger than heartbeat timeout") + } + if config.ElectionTimeout < config.HeartbeatTimeout { + return fmt.Errorf("Election timeout must be equal or greater than Heartbeat Timeout") + } + return nil +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/discard_snapshot.go b/Godeps/_workspace/src/github.com/hashicorp/raft/discard_snapshot.go new file mode 100644 index 00000000000..1b4611d559f --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/discard_snapshot.go @@ -0,0 +1,48 @@ +package raft + +import ( + "fmt" + "io" +) + +// DiscardSnapshotStore is used to successfully snapshot while +// always discarding the snapshot. This is useful for when the +// log should be truncated but no snapshot should be retained. +// This should never be used for production use, and is only +// suitable for testing. +type DiscardSnapshotStore struct{} + +type DiscardSnapshotSink struct{} + +// NewDiscardSnapshotStore is used to create a new DiscardSnapshotStore. +func NewDiscardSnapshotStore() *DiscardSnapshotStore { + return &DiscardSnapshotStore{} +} + +func (d *DiscardSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) { + return &DiscardSnapshotSink{}, nil +} + +func (d *DiscardSnapshotStore) List() ([]*SnapshotMeta, error) { + return nil, nil +} + +func (d *DiscardSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + return nil, nil, fmt.Errorf("open is not supported") +} + +func (d *DiscardSnapshotSink) Write(b []byte) (int, error) { + return len(b), nil +} + +func (d *DiscardSnapshotSink) Close() error { + return nil +} + +func (d *DiscardSnapshotSink) ID() string { + return "discard" +} + +func (d *DiscardSnapshotSink) Cancel() error { + return nil +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/file_snapshot.go b/Godeps/_workspace/src/github.com/hashicorp/raft/file_snapshot.go new file mode 100644 index 00000000000..a8955373422 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/file_snapshot.go @@ -0,0 +1,470 @@ +package raft + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "hash" + "hash/crc64" + "io" + "io/ioutil" + "log" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +const ( + testPath = "permTest" + snapPath = "snapshots" + metaFilePath = "meta.json" + stateFilePath = "state.bin" + tmpSuffix = ".tmp" +) + +// FileSnapshotStore implements the SnapshotStore interface and allows +// snapshots to be made on the local disk. +type FileSnapshotStore struct { + path string + retain int + logger *log.Logger +} + +type snapMetaSlice []*fileSnapshotMeta + +// FileSnapshotSink implements SnapshotSink with a file. +type FileSnapshotSink struct { + store *FileSnapshotStore + logger *log.Logger + dir string + meta fileSnapshotMeta + + stateFile *os.File + stateHash hash.Hash64 + buffered *bufio.Writer + + closed bool +} + +// fileSnapshotMeta is stored on disk. We also put a CRC +// on disk so that we can verify the snapshot. +type fileSnapshotMeta struct { + SnapshotMeta + CRC []byte +} + +// bufferedFile is returned when we open a snapshot. This way +// reads are buffered and the file still gets closed. +type bufferedFile struct { + bh *bufio.Reader + fh *os.File +} + +func (b *bufferedFile) Read(p []byte) (n int, err error) { + return b.bh.Read(p) +} + +func (b *bufferedFile) Close() error { + return b.fh.Close() +} + +// NewFileSnapshotStoreWithLogger creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStoreWithLogger(base string, retain int, logger *log.Logger) (*FileSnapshotStore, error) { + if retain < 1 { + return nil, fmt.Errorf("must retain at least one snapshot") + } + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + + // Ensure our path exists + path := filepath.Join(base, snapPath) + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return nil, fmt.Errorf("snapshot path not accessible: %v", err) + } + + // Setup the store + store := &FileSnapshotStore{ + path: path, + retain: retain, + logger: logger, + } + + // Do a permissions test + if err := store.testPermissions(); err != nil { + return nil, fmt.Errorf("permissions test failed: %v", err) + } + return store, nil +} + +// NewFileSnapshotStore creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStore(base string, retain int, logOutput io.Writer) (*FileSnapshotStore, error) { + if logOutput == nil { + logOutput = os.Stderr + } + return NewFileSnapshotStoreWithLogger(base, retain, log.New(logOutput, "", log.LstdFlags)) +} + +// testPermissions tries to touch a file in our path to see if it works. +func (f *FileSnapshotStore) testPermissions() error { + path := filepath.Join(f.path, testPath) + fh, err := os.Create(path) + if err != nil { + return err + } + fh.Close() + os.Remove(path) + return nil +} + +// snapshotName generates a name for the snapshot. +func snapshotName(term, index uint64) string { + now := time.Now() + msec := now.UnixNano() / int64(time.Millisecond) + return fmt.Sprintf("%d-%d-%d", term, index, msec) +} + +// Create is used to start a new snapshot +func (f *FileSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) { + // Create a new path + name := snapshotName(term, index) + path := filepath.Join(f.path, name+tmpSuffix) + f.logger.Printf("[INFO] snapshot: Creating new snapshot at %s", path) + + // Make the directory + if err := os.MkdirAll(path, 0755); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to make snapshot directory: %v", err) + return nil, err + } + + // Create the sink + sink := &FileSnapshotSink{ + store: f, + logger: f.logger, + dir: path, + meta: fileSnapshotMeta{ + SnapshotMeta: SnapshotMeta{ + ID: name, + Index: index, + Term: term, + Peers: peers, + }, + CRC: nil, + }, + } + + // Write out the meta data + if err := sink.writeMeta(); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return nil, err + } + + // Open the state file + statePath := filepath.Join(path, stateFilePath) + fh, err := os.Create(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to create state file: %v", err) + return nil, err + } + sink.stateFile = fh + + // Create a CRC64 hash + sink.stateHash = crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Wrap both the hash and file in a MultiWriter with buffering + multi := io.MultiWriter(sink.stateFile, sink.stateHash) + sink.buffered = bufio.NewWriter(multi) + + // Done + return sink, nil +} + +// List returns available snapshots in the store. +func (f *FileSnapshotStore) List() ([]*SnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return nil, err + } + + var snapMeta []*SnapshotMeta + for _, meta := range snapshots { + snapMeta = append(snapMeta, &meta.SnapshotMeta) + if len(snapMeta) == f.retain { + break + } + } + return snapMeta, nil +} + +// getSnapshots returns all the known snapshots. +func (f *FileSnapshotStore) getSnapshots() ([]*fileSnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := ioutil.ReadDir(f.path) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to scan snapshot dir: %v", err) + return nil, err + } + + // Populate the metadata + var snapMeta []*fileSnapshotMeta + for _, snap := range snapshots { + // Ignore any files + if !snap.IsDir() { + continue + } + + // Ignore any temporary snapshots + dirName := snap.Name() + if strings.HasSuffix(dirName, tmpSuffix) { + f.logger.Printf("[WARN] snapshot: Found temporary snapshot: %v", dirName) + continue + } + + // Try to read the meta data + meta, err := f.readMeta(dirName) + if err != nil { + f.logger.Printf("[WARN] snapshot: Failed to read metadata for %v: %v", dirName, err) + continue + } + + // Append, but only return up to the retain count + snapMeta = append(snapMeta, meta) + } + + // Sort the snapshot, reverse so we get new -> old + sort.Sort(sort.Reverse(snapMetaSlice(snapMeta))) + + return snapMeta, nil +} + +// readMeta is used to read the meta data for a given named backup +func (f *FileSnapshotStore) readMeta(name string) (*fileSnapshotMeta, error) { + // Open the meta file + metaPath := filepath.Join(f.path, name, metaFilePath) + fh, err := os.Open(metaPath) + if err != nil { + return nil, err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewReader(fh) + + // Read in the JSON + meta := &fileSnapshotMeta{} + dec := json.NewDecoder(buffered) + if err := dec.Decode(meta); err != nil { + return nil, err + } + return meta, nil +} + +// Open takes a snapshot ID and returns a ReadCloser for that snapshot. +func (f *FileSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + // Get the metadata + meta, err := f.readMeta(id) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get meta data to open snapshot: %v", err) + return nil, nil, err + } + + // Open the state file + statePath := filepath.Join(f.path, id, stateFilePath) + fh, err := os.Open(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to open state file: %v", err) + return nil, nil, err + } + + // Create a CRC64 hash + stateHash := crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Compute the hash + _, err = io.Copy(stateHash, fh) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to read state file: %v", err) + fh.Close() + return nil, nil, err + } + + // Verify the hash + computed := stateHash.Sum(nil) + if bytes.Compare(meta.CRC, computed) != 0 { + f.logger.Printf("[ERR] snapshot: CRC checksum failed (stored: %v computed: %v)", + meta.CRC, computed) + fh.Close() + return nil, nil, fmt.Errorf("CRC mismatch") + } + + // Seek to the start + if _, err := fh.Seek(0, 0); err != nil { + f.logger.Printf("[ERR] snapshot: State file seek failed: %v", err) + fh.Close() + return nil, nil, err + } + + // Return a buffered file + buffered := &bufferedFile{ + bh: bufio.NewReader(fh), + fh: fh, + } + + return &meta.SnapshotMeta, buffered, nil +} + +// ReapSnapshots reaps any snapshots beyond the retain count. +func (f *FileSnapshotStore) ReapSnapshots() error { + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return err + } + + for i := f.retain; i < len(snapshots); i++ { + path := filepath.Join(f.path, snapshots[i].ID) + f.logger.Printf("[INFO] snapshot: reaping snapshot %v", path) + if err := os.RemoveAll(path); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to reap snapshot %v: %v", path, err) + return err + } + } + return nil +} + +// ID returns the ID of the snapshot, can be used with Open() +// after the snapshot is finalized. +func (s *FileSnapshotSink) ID() string { + return s.meta.ID +} + +// Write is used to append to the state file. We write to the +// buffered IO object to reduce the amount of context switches. +func (s *FileSnapshotSink) Write(b []byte) (int, error) { + return s.buffered.Write(b) +} + +// Close is used to indicate a successful end. +func (s *FileSnapshotSink) Close() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Write out the meta data + if err := s.writeMeta(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return err + } + + // Move the directory into place + newPath := strings.TrimSuffix(s.dir, tmpSuffix) + if err := os.Rename(s.dir, newPath); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to move snapshot into place: %v", err) + return err + } + + // Reap any old snapshots + s.store.ReapSnapshots() + return nil +} + +// Cancel is used to indicate an unsuccessful end. +func (s *FileSnapshotSink) Cancel() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Attempt to remove all artifacts + return os.RemoveAll(s.dir) +} + +// finalize is used to close all of our resources. +func (s *FileSnapshotSink) finalize() error { + // Flush any remaining data + if err := s.buffered.Flush(); err != nil { + return err + } + + // Get the file size + stat, statErr := s.stateFile.Stat() + + // Close the file + if err := s.stateFile.Close(); err != nil { + return err + } + + // Set the file size, check after we close + if statErr != nil { + return statErr + } + s.meta.Size = stat.Size() + + // Set the CRC + s.meta.CRC = s.stateHash.Sum(nil) + return nil +} + +// writeMeta is used to write out the metadata we have. +func (s *FileSnapshotSink) writeMeta() error { + // Open the meta file + metaPath := filepath.Join(s.dir, metaFilePath) + fh, err := os.Create(metaPath) + if err != nil { + return err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewWriter(fh) + defer buffered.Flush() + + // Write out as JSON + enc := json.NewEncoder(buffered) + if err := enc.Encode(&s.meta); err != nil { + return err + } + return nil +} + +// Implement the sort interface for []*fileSnapshotMeta. +func (s snapMetaSlice) Len() int { + return len(s) +} + +func (s snapMetaSlice) Less(i, j int) bool { + if s[i].Term != s[j].Term { + return s[i].Term < s[j].Term + } + if s[i].Index != s[j].Index { + return s[i].Index < s[j].Index + } + return s[i].ID < s[j].ID +} + +func (s snapMetaSlice) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/fsm.go b/Godeps/_workspace/src/github.com/hashicorp/raft/fsm.go new file mode 100644 index 00000000000..ea8ab548dbc --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/fsm.go @@ -0,0 +1,37 @@ +package raft + +import ( + "io" +) + +// FSM provides an interface that can be implemented by +// clients to make use of the replicated log. +type FSM interface { + // Apply log is invoked once a log entry is committed. + Apply(*Log) interface{} + + // Snapshot is used to support log compaction. This call should + // return an FSMSnapshot which can be used to save a point-in-time + // snapshot of the FSM. Apply and Snapshot are not called in multiple + // threads, but Apply will be called concurrently with Persist. This means + // the FSM should be implemented in a fashion that allows for concurrent + // updates while a snapshot is happening. + Snapshot() (FSMSnapshot, error) + + // Restore is used to restore an FSM from a snapshot. It is not called + // concurrently with any other command. The FSM must discard all previous + // state. + Restore(io.ReadCloser) error +} + +// FSMSnapshot is returned by an FSM in response to a Snapshot +// It must be safe to invoke FSMSnapshot methods with concurrent +// calls to Apply. +type FSMSnapshot interface { + // Persist should dump all necessary state to the WriteCloser 'sink', + // and call sink.Close() when finished or call sink.Cancel() on error. + Persist(sink SnapshotSink) error + + // Release is invoked when we are finished with the snapshot. + Release() +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/future.go b/Godeps/_workspace/src/github.com/hashicorp/raft/future.go new file mode 100644 index 00000000000..854e1ac927b --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/future.go @@ -0,0 +1,182 @@ +package raft + +import ( + "sync" + "time" +) + +// Future is used to represent an action that may occur in the future. +type Future interface { + Error() error +} + +// ApplyFuture is used for Apply() and can returns the FSM response. +type ApplyFuture interface { + Future + Response() interface{} + Index() uint64 +} + +// errorFuture is used to return a static error. +type errorFuture struct { + err error +} + +func (e errorFuture) Error() error { + return e.err +} + +func (e errorFuture) Response() interface{} { + return nil +} + +func (e errorFuture) Index() uint64 { + return 0 +} + +// deferError can be embedded to allow a future +// to provide an error in the future. +type deferError struct { + err error + errCh chan error + responded bool +} + +func (d *deferError) init() { + d.errCh = make(chan error, 1) +} + +func (d *deferError) Error() error { + if d.err != nil { + return d.err + } + if d.errCh == nil { + panic("waiting for response on nil channel") + } + d.err = <-d.errCh + return d.err +} + +func (d *deferError) respond(err error) { + if d.errCh == nil { + return + } + if d.responded { + return + } + d.errCh <- err + close(d.errCh) + d.responded = true +} + +// logFuture is used to apply a log entry and waits until +// the log is considered committed. +type logFuture struct { + deferError + log Log + policy quorumPolicy + response interface{} + dispatch time.Time +} + +func (l *logFuture) Response() interface{} { + return l.response +} + +func (l *logFuture) Index() uint64 { + return l.log.Index +} + +type peerFuture struct { + deferError + peers []string +} + +type shutdownFuture struct { + raft *Raft +} + +func (s *shutdownFuture) Error() error { + for s.raft.getRoutines() > 0 { + time.Sleep(5 * time.Millisecond) + } + return nil +} + +// snapshotFuture is used for waiting on a snapshot to complete. +type snapshotFuture struct { + deferError +} + +// reqSnapshotFuture is used for requesting a snapshot start. +// It is only used internally. +type reqSnapshotFuture struct { + deferError + + // snapshot details provided by the FSM runner before responding + index uint64 + term uint64 + peers []string + snapshot FSMSnapshot +} + +// restoreFuture is used for requesting an FSM to perform a +// snapshot restore. Used internally only. +type restoreFuture struct { + deferError + ID string +} + +// verifyFuture is used to verify the current node is still +// the leader. This is to prevent a stale read. +type verifyFuture struct { + deferError + notifyCh chan *verifyFuture + quorumSize int + votes int + voteLock sync.Mutex +} + +// vote is used to respond to a verifyFuture. +// This may block when responding on the notifyCh. +func (v *verifyFuture) vote(leader bool) { + v.voteLock.Lock() + defer v.voteLock.Unlock() + + // Guard against having notified already + if v.notifyCh == nil { + return + } + + if leader { + v.votes++ + if v.votes >= v.quorumSize { + v.notifyCh <- v + v.notifyCh = nil + } + } else { + v.notifyCh <- v + v.notifyCh = nil + } +} + +// appendFuture is used for waiting on a pipelined append +// entries RPC. +type appendFuture struct { + deferError + start time.Time + args *AppendEntriesRequest + resp *AppendEntriesResponse +} + +func (a *appendFuture) Start() time.Time { + return a.start +} + +func (a *appendFuture) Request() *AppendEntriesRequest { + return a.args +} + +func (a *appendFuture) Response() *AppendEntriesResponse { + return a.resp +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/inflight.go b/Godeps/_workspace/src/github.com/hashicorp/raft/inflight.go new file mode 100644 index 00000000000..7014ff50394 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/inflight.go @@ -0,0 +1,213 @@ +package raft + +import ( + "container/list" + "sync" +) + +// QuorumPolicy allows individual logFutures to have different +// commitment rules while still using the inflight mechanism. +type quorumPolicy interface { + // Checks if a commit from a given peer is enough to + // satisfy the commitment rules + Commit() bool + + // Checks if a commit is committed + IsCommitted() bool +} + +// MajorityQuorum is used by Apply transactions and requires +// a simple majority of nodes. +type majorityQuorum struct { + count int + votesNeeded int +} + +func newMajorityQuorum(clusterSize int) *majorityQuorum { + votesNeeded := (clusterSize / 2) + 1 + return &majorityQuorum{count: 0, votesNeeded: votesNeeded} +} + +func (m *majorityQuorum) Commit() bool { + m.count++ + return m.count >= m.votesNeeded +} + +func (m *majorityQuorum) IsCommitted() bool { + return m.count >= m.votesNeeded +} + +// Inflight is used to track operations that are still in-flight. +type inflight struct { + sync.Mutex + committed *list.List + commitCh chan struct{} + minCommit uint64 + maxCommit uint64 + operations map[uint64]*logFuture + stopCh chan struct{} +} + +// NewInflight returns an inflight struct that notifies +// the provided channel when logs are finished committing. +func newInflight(commitCh chan struct{}) *inflight { + return &inflight{ + committed: list.New(), + commitCh: commitCh, + minCommit: 0, + maxCommit: 0, + operations: make(map[uint64]*logFuture), + stopCh: make(chan struct{}), + } +} + +// Start is used to mark a logFuture as being inflight. It +// also commits the entry, as it is assumed the leader is +// starting. +func (i *inflight) Start(l *logFuture) { + i.Lock() + defer i.Unlock() + i.start(l) +} + +// StartAll is used to mark a list of logFuture's as being +// inflight. It also commits each entry as the leader is +// assumed to be starting. +func (i *inflight) StartAll(logs []*logFuture) { + i.Lock() + defer i.Unlock() + for _, l := range logs { + i.start(l) + } +} + +// start is used to mark a single entry as inflight, +// must be invoked with the lock held. +func (i *inflight) start(l *logFuture) { + idx := l.log.Index + i.operations[idx] = l + + if idx > i.maxCommit { + i.maxCommit = idx + } + if i.minCommit == 0 { + i.minCommit = idx + } + i.commit(idx) +} + +// Cancel is used to cancel all in-flight operations. +// This is done when the leader steps down, and all futures +// are sent the given error. +func (i *inflight) Cancel(err error) { + // Close the channel first to unblock any pending commits + close(i.stopCh) + + // Lock after close to avoid deadlock + i.Lock() + defer i.Unlock() + + // Respond to all inflight operations + for _, op := range i.operations { + op.respond(err) + } + + // Clear all the committed but not processed + for e := i.committed.Front(); e != nil; e = e.Next() { + e.Value.(*logFuture).respond(err) + } + + // Clear the map + i.operations = make(map[uint64]*logFuture) + + // Clear the list of committed + i.committed = list.New() + + // Close the commmitCh + close(i.commitCh) + + // Reset indexes + i.minCommit = 0 + i.maxCommit = 0 +} + +// Committed returns all the committed operations in order. +func (i *inflight) Committed() (l *list.List) { + i.Lock() + l, i.committed = i.committed, list.New() + i.Unlock() + return l +} + +// Commit is used by leader replication routines to indicate that +// a follower was finished committing a log to disk. +func (i *inflight) Commit(index uint64) { + i.Lock() + defer i.Unlock() + i.commit(index) +} + +// CommitRange is used to commit a range of indexes inclusively. +// It is optimized to avoid commits for indexes that are not tracked. +func (i *inflight) CommitRange(minIndex, maxIndex uint64) { + i.Lock() + defer i.Unlock() + + // Update the minimum index + minIndex = max(i.minCommit, minIndex) + + // Commit each index + for idx := minIndex; idx <= maxIndex; idx++ { + i.commit(idx) + } +} + +// commit is used to commit a single index. Must be called with the lock held. +func (i *inflight) commit(index uint64) { + op, ok := i.operations[index] + if !ok { + // Ignore if not in the map, as it may be committed already + return + } + + // Check if we've satisfied the commit + if !op.policy.Commit() { + return + } + + // Cannot commit if this is not the minimum inflight. This can happen + // if the quorum size changes, meaning a previous commit requires a larger + // quorum that this commit. We MUST block until the previous log is committed, + // otherwise logs will be applied out of order. + if index != i.minCommit { + return + } + +NOTIFY: + // Add the operation to the committed list + i.committed.PushBack(op) + + // Stop tracking since it is committed + delete(i.operations, index) + + // Update the indexes + if index == i.maxCommit { + i.minCommit = 0 + i.maxCommit = 0 + + } else { + i.minCommit++ + } + + // Check if the next in-flight operation is ready + if i.minCommit != 0 { + op = i.operations[i.minCommit] + if op.policy.IsCommitted() { + index = i.minCommit + goto NOTIFY + } + } + + // Async notify of ready operations + asyncNotifyCh(i.commitCh) +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/inmem_store.go b/Godeps/_workspace/src/github.com/hashicorp/raft/inmem_store.go new file mode 100644 index 00000000000..6e4dfd020f7 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/inmem_store.go @@ -0,0 +1,116 @@ +package raft + +import ( + "sync" +) + +// InmemStore implements the LogStore and StableStore interface. +// It should NOT EVER be used for production. It is used only for +// unit tests. Use the MDBStore implementation instead. +type InmemStore struct { + l sync.RWMutex + lowIndex uint64 + highIndex uint64 + logs map[uint64]*Log + kv map[string][]byte + kvInt map[string]uint64 +} + +// NewInmemStore returns a new in-memory backend. Do not ever +// use for production. Only for testing. +func NewInmemStore() *InmemStore { + i := &InmemStore{ + logs: make(map[uint64]*Log), + kv: make(map[string][]byte), + kvInt: make(map[string]uint64), + } + return i +} + +// FirstIndex implements the LogStore interface. +func (i *InmemStore) FirstIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.lowIndex, nil +} + +// LastIndex implements the LogStore interface. +func (i *InmemStore) LastIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.highIndex, nil +} + +// GetLog implements the LogStore interface. +func (i *InmemStore) GetLog(index uint64, log *Log) error { + i.l.RLock() + defer i.l.RUnlock() + l, ok := i.logs[index] + if !ok { + return ErrLogNotFound + } + *log = *l + return nil +} + +// StoreLog implements the LogStore interface. +func (i *InmemStore) StoreLog(log *Log) error { + return i.StoreLogs([]*Log{log}) +} + +// StoreLogs implements the LogStore interface. +func (i *InmemStore) StoreLogs(logs []*Log) error { + i.l.Lock() + defer i.l.Unlock() + for _, l := range logs { + i.logs[l.Index] = l + if i.lowIndex == 0 { + i.lowIndex = l.Index + } + if l.Index > i.highIndex { + i.highIndex = l.Index + } + } + return nil +} + +// DeleteRange implements the LogStore interface. +func (i *InmemStore) DeleteRange(min, max uint64) error { + i.l.Lock() + defer i.l.Unlock() + for j := min; j <= max; j++ { + delete(i.logs, j) + } + i.lowIndex = max + 1 + return nil +} + +// Set implements the StableStore interface. +func (i *InmemStore) Set(key []byte, val []byte) error { + i.l.Lock() + defer i.l.Unlock() + i.kv[string(key)] = val + return nil +} + +// Get implements the StableStore interface. +func (i *InmemStore) Get(key []byte) ([]byte, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kv[string(key)], nil +} + +// SetUint64 implements the StableStore interface. +func (i *InmemStore) SetUint64(key []byte, val uint64) error { + i.l.Lock() + defer i.l.Unlock() + i.kvInt[string(key)] = val + return nil +} + +// GetUint64 implements the StableStore interface. +func (i *InmemStore) GetUint64(key []byte) (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kvInt[string(key)], nil +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/inmem_transport.go b/Godeps/_workspace/src/github.com/hashicorp/raft/inmem_transport.go new file mode 100644 index 00000000000..994d06d8fad --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/inmem_transport.go @@ -0,0 +1,315 @@ +package raft + +import ( + "fmt" + "io" + "sync" + "time" +) + +// NewInmemAddr returns a new in-memory addr with +// a randomly generate UUID as the ID. +func NewInmemAddr() string { + return generateUUID() +} + +// inmemPipeline is used to pipeline requests for the in-mem transport. +type inmemPipeline struct { + trans *InmemTransport + peer *InmemTransport + peerAddr string + + doneCh chan AppendFuture + inprogressCh chan *inmemPipelineInflight + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +type inmemPipelineInflight struct { + future *appendFuture + respCh <-chan RPCResponse +} + +// InmemTransport Implements the Transport interface, to allow Raft to be +// tested in-memory without going over a network. +type InmemTransport struct { + sync.RWMutex + consumerCh chan RPC + localAddr string + peers map[string]*InmemTransport + pipelines []*inmemPipeline + timeout time.Duration +} + +// NewInmemTransport is used to initialize a new transport +// and generates a random local address. +func NewInmemTransport() (string, *InmemTransport) { + addr := NewInmemAddr() + trans := &InmemTransport{ + consumerCh: make(chan RPC, 16), + localAddr: addr, + peers: make(map[string]*InmemTransport), + timeout: 50 * time.Millisecond, + } + return addr, trans +} + +// SetHeartbeatHandler is used to set optional fast-path for +// heartbeats, not supported for this transport. +func (i *InmemTransport) SetHeartbeatHandler(cb func(RPC)) { +} + +// Consumer implements the Transport interface. +func (i *InmemTransport) Consumer() <-chan RPC { + return i.consumerCh +} + +// LocalAddr implements the Transport interface. +func (i *InmemTransport) LocalAddr() string { + return i.localAddr +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (i *InmemTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + if !ok { + return nil, fmt.Errorf("failed to connect to peer: %v", target) + } + pipeline := newInmemPipeline(i, peer, target) + i.Lock() + i.pipelines = append(i.pipelines, pipeline) + i.Unlock() + return pipeline, nil +} + +// AppendEntries implements the Transport interface. +func (i *InmemTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*AppendEntriesResponse) + *resp = *out + return nil +} + +// RequestVote implements the Transport interface. +func (i *InmemTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*RequestVoteResponse) + *resp = *out + return nil +} + +// InstallSnapshot implements the Transport interface. +func (i *InmemTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + rpcResp, err := i.makeRPC(target, args, data, 10*i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*InstallSnapshotResponse) + *resp = *out + return nil +} + +func (i *InmemTransport) makeRPC(target string, args interface{}, r io.Reader, timeout time.Duration) (rpcResp RPCResponse, err error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + + if !ok { + err = fmt.Errorf("failed to connect to peer: %v", target) + return + } + + // Send the RPC over + respCh := make(chan RPCResponse) + peer.consumerCh <- RPC{ + Command: args, + Reader: r, + RespChan: respCh, + } + + // Wait for a response + select { + case rpcResp = <-respCh: + if rpcResp.Error != nil { + err = rpcResp.Error + } + case <-time.After(timeout): + err = fmt.Errorf("command timed out") + } + return +} + +// EncodePeer implements the Transport interface. It uses the UUID as the +// address directly. +func (i *InmemTransport) EncodePeer(p string) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. It wraps the UUID in an +// InmemAddr. +func (i *InmemTransport) DecodePeer(buf []byte) string { + return string(buf) +} + +// Connect is used to connect this transport to another transport for +// a given peer name. This allows for local routing. +func (i *InmemTransport) Connect(peer string, trans *InmemTransport) { + i.Lock() + defer i.Unlock() + i.peers[peer] = trans +} + +// Disconnect is used to remove the ability to route to a given peer. +func (i *InmemTransport) Disconnect(peer string) { + i.Lock() + defer i.Unlock() + delete(i.peers, peer) + + // Disconnect any pipelines + n := len(i.pipelines) + for idx := 0; idx < n; idx++ { + if i.pipelines[idx].peerAddr == peer { + i.pipelines[idx].Close() + i.pipelines[idx], i.pipelines[n-1] = i.pipelines[n-1], nil + idx-- + n-- + } + } + i.pipelines = i.pipelines[:n] +} + +// DisconnectAll is used to remove all routes to peers. +func (i *InmemTransport) DisconnectAll() { + i.Lock() + defer i.Unlock() + i.peers = make(map[string]*InmemTransport) + + // Handle pipelines + for _, pipeline := range i.pipelines { + pipeline.Close() + } + i.pipelines = nil +} + +func newInmemPipeline(trans *InmemTransport, peer *InmemTransport, addr string) *inmemPipeline { + i := &inmemPipeline{ + trans: trans, + peer: peer, + peerAddr: addr, + doneCh: make(chan AppendFuture, 16), + inprogressCh: make(chan *inmemPipelineInflight, 16), + shutdownCh: make(chan struct{}), + } + go i.decodeResponses() + return i +} + +func (i *inmemPipeline) decodeResponses() { + timeout := i.trans.timeout + for { + select { + case inp := <-i.inprogressCh: + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + + select { + case rpcResp := <-inp.respCh: + // Copy the result back + *inp.future.resp = *rpcResp.Response.(*AppendEntriesResponse) + inp.future.respond(rpcResp.Error) + + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-timeoutCh: + inp.future.respond(fmt.Errorf("command timed out")) + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-i.shutdownCh: + return + } + case <-i.shutdownCh: + return + } + } +} + +func (i *inmemPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Handle a timeout + var timeout <-chan time.Time + if i.trans.timeout > 0 { + timeout = time.After(i.trans.timeout) + } + + // Send the RPC over + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + Command: args, + RespChan: respCh, + } + select { + case i.peer.consumerCh <- rpc: + case <-timeout: + return nil, fmt.Errorf("command enqueue timeout") + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } + + // Send to be decoded + select { + case i.inprogressCh <- &inmemPipelineInflight{future, respCh}: + return future, nil + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +func (i *inmemPipeline) Consumer() <-chan AppendFuture { + return i.doneCh +} + +func (i *inmemPipeline) Close() error { + i.shutdownLock.Lock() + defer i.shutdownLock.Unlock() + if i.shutdown { + return nil + } + + i.shutdown = true + close(i.shutdownCh) + return nil +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/log.go b/Godeps/_workspace/src/github.com/hashicorp/raft/log.go new file mode 100644 index 00000000000..a8c5a40eabf --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/log.go @@ -0,0 +1,60 @@ +package raft + +// LogType describes various types of log entries. +type LogType uint8 + +const ( + // LogCommand is applied to a user FSM. + LogCommand LogType = iota + + // LogNoop is used to assert leadership. + LogNoop + + // LogAddPeer is used to add a new peer. + LogAddPeer + + // LogRemovePeer is used to remove an existing peer. + LogRemovePeer + + // LogBarrier is used to ensure all preceding operations have been + // applied to the FSM. It is similar to LogNoop, but instead of returning + // once committed, it only returns once the FSM manager acks it. Otherwise + // it is possible there are operations committed but not yet applied to + // the FSM. + LogBarrier +) + +// Log entries are replicated to all members of the Raft cluster +// and form the heart of the replicated state machine. +type Log struct { + Index uint64 + Term uint64 + Type LogType + Data []byte + + // peer is not exported since it is not transmitted, only used + // internally to construct the Data field. + peer string +} + +// LogStore is used to provide an interface for storing +// and retrieving logs in a durable fashion. +type LogStore interface { + // Returns the first index written. 0 for no entries. + FirstIndex() (uint64, error) + + // Returns the last index written. 0 for no entries. + LastIndex() (uint64, error) + + // Gets a log entry at a given index. + GetLog(index uint64, log *Log) error + + // Stores a log entry. + StoreLog(log *Log) error + + // Stores multiple log entries. + StoreLogs(logs []*Log) error + + // Deletes a range of log entries. The range is inclusive. + DeleteRange(min, max uint64) error +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/log_cache.go b/Godeps/_workspace/src/github.com/hashicorp/raft/log_cache.go new file mode 100644 index 00000000000..952e98c2282 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/log_cache.go @@ -0,0 +1,79 @@ +package raft + +import ( + "fmt" + "sync" +) + +// LogCache wraps any LogStore implementation to provide an +// in-memory ring buffer. This is used to cache access to +// the recently written entries. For implementations that do not +// cache themselves, this can provide a substantial boost by +// avoiding disk I/O on recent entries. +type LogCache struct { + store LogStore + + cache []*Log + l sync.RWMutex +} + +// NewLogCache is used to create a new LogCache with the +// given capacity and backend store. +func NewLogCache(capacity int, store LogStore) (*LogCache, error) { + if capacity <= 0 { + return nil, fmt.Errorf("capacity must be positive") + } + c := &LogCache{ + store: store, + cache: make([]*Log, capacity), + } + return c, nil +} + +func (c *LogCache) GetLog(idx uint64, log *Log) error { + // Check the buffer for an entry + c.l.RLock() + cached := c.cache[idx%uint64(len(c.cache))] + c.l.RUnlock() + + // Check if entry is valid + if cached != nil && cached.Index == idx { + *log = *cached + return nil + } + + // Forward request on cache miss + return c.store.GetLog(idx, log) +} + +func (c *LogCache) StoreLog(log *Log) error { + return c.StoreLogs([]*Log{log}) +} + +func (c *LogCache) StoreLogs(logs []*Log) error { + // Insert the logs into the ring buffer + c.l.Lock() + for _, l := range logs { + c.cache[l.Index%uint64(len(c.cache))] = l + } + c.l.Unlock() + + return c.store.StoreLogs(logs) +} + +func (c *LogCache) FirstIndex() (uint64, error) { + return c.store.FirstIndex() +} + +func (c *LogCache) LastIndex() (uint64, error) { + return c.store.LastIndex() +} + +func (c *LogCache) DeleteRange(min, max uint64) error { + // Invalidate the cache on deletes + c.l.Lock() + c.cache = make([]*Log, len(c.cache)) + c.l.Unlock() + + return c.store.DeleteRange(min, max) +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/net_transport.go b/Godeps/_workspace/src/github.com/hashicorp/raft/net_transport.go new file mode 100644 index 00000000000..9eb4fe054e8 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/net_transport.go @@ -0,0 +1,622 @@ +package raft + +import ( + "bufio" + "errors" + "fmt" + "io" + "log" + "net" + "os" + "sync" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +const ( + rpcAppendEntries uint8 = iota + rpcRequestVote + rpcInstallSnapshot + + // DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport. + DefaultTimeoutScale = 256 * 1024 // 256KB + + // rpcMaxPipeline controls the maximum number of outstanding + // AppendEntries RPC calls. + rpcMaxPipeline = 128 +) + +var ( + // ErrTransportShutdown is returned when operations on a transport are + // invoked after it's been terminated. + ErrTransportShutdown = errors.New("transport shutdown") + + // ErrPipelineShutdown is returned when the pipeline is closed. + ErrPipelineShutdown = errors.New("append pipeline closed") +) + +/* + +NetworkTransport provides a network based transport that can be +used to communicate with Raft on remote machines. It requires +an underlying stream layer to provide a stream abstraction, which can +be simple TCP, TLS, etc. + +This transport is very simple and lightweight. Each RPC request is +framed by sending a byte that indicates the message type, followed +by the MsgPack encoded request. + +The response is an error string followed by the response object, +both are encoded using MsgPack. + +InstallSnapshot is special, in that after the RPC request we stream +the entire state. That socket is not re-used as the connection state +is not known if there is an error. + +*/ +type NetworkTransport struct { + connPool map[string][]*netConn + connPoolLock sync.Mutex + + consumeCh chan RPC + + heartbeatFn func(RPC) + heartbeatFnLock sync.Mutex + + logger *log.Logger + + maxPool int + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + stream StreamLayer + + timeout time.Duration + TimeoutScale int +} + +// StreamLayer is used with the NetworkTransport to provide +// the low level stream abstraction. +type StreamLayer interface { + net.Listener + + // Dial is used to create a new outgoing connection + Dial(address string, timeout time.Duration) (net.Conn, error) +} + +type netConn struct { + target string + conn net.Conn + r *bufio.Reader + w *bufio.Writer + dec *codec.Decoder + enc *codec.Encoder +} + +func (n *netConn) Release() error { + return n.conn.Close() +} + +type netPipeline struct { + conn *netConn + trans *NetworkTransport + + doneCh chan AppendFuture + inprogressCh chan *appendFuture + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +// NewNetworkTransport creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). +func NewNetworkTransport( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) *NetworkTransport { + if logOutput == nil { + logOutput = os.Stderr + } + return NewNetworkTransportWithLogger(stream, maxPool, timeout, log.New(logOutput, "", log.LstdFlags)) +} + +// NewNetworkTransportWithLogger creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). +func NewNetworkTransportWithLogger( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) *NetworkTransport { + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + trans := &NetworkTransport{ + connPool: make(map[string][]*netConn), + consumeCh: make(chan RPC), + logger: logger, + maxPool: maxPool, + shutdownCh: make(chan struct{}), + stream: stream, + timeout: timeout, + TimeoutScale: DefaultTimeoutScale, + } + go trans.listen() + return trans +} + +// SetHeartbeatHandler is used to setup a heartbeat handler +// as a fast-pass. This is to avoid head-of-line blocking from +// disk IO. +func (n *NetworkTransport) SetHeartbeatHandler(cb func(rpc RPC)) { + n.heartbeatFnLock.Lock() + defer n.heartbeatFnLock.Unlock() + n.heartbeatFn = cb +} + +// Close is used to stop the network transport. +func (n *NetworkTransport) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + + if !n.shutdown { + close(n.shutdownCh) + n.stream.Close() + n.shutdown = true + } + return nil +} + +// Consumer implements the Transport interface. +func (n *NetworkTransport) Consumer() <-chan RPC { + return n.consumeCh +} + +// LocalAddr implements the Transport interface. +func (n *NetworkTransport) LocalAddr() string { + return n.stream.Addr().String() +} + +// IsShutdown is used to check if the transport is shutdown. +func (n *NetworkTransport) IsShutdown() bool { + select { + case <-n.shutdownCh: + return true + default: + return false + } +} + +// getExistingConn is used to grab a pooled connection. +func (n *NetworkTransport) getPooledConn(target string) *netConn { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + conns, ok := n.connPool[target] + if !ok || len(conns) == 0 { + return nil + } + + var conn *netConn + num := len(conns) + conn, conns[num-1] = conns[num-1], nil + n.connPool[target] = conns[:num-1] + return conn +} + +// getConn is used to get a connection from the pool. +func (n *NetworkTransport) getConn(target string) (*netConn, error) { + // Check for a pooled conn + if conn := n.getPooledConn(target); conn != nil { + return conn, nil + } + + // Dial a new connection + conn, err := n.stream.Dial(target, n.timeout) + if err != nil { + return nil, err + } + + // Wrap the conn + netConn := &netConn{ + target: target, + conn: conn, + r: bufio.NewReader(conn), + w: bufio.NewWriter(conn), + } + + // Setup encoder/decoders + netConn.dec = codec.NewDecoder(netConn.r, &codec.MsgpackHandle{}) + netConn.enc = codec.NewEncoder(netConn.w, &codec.MsgpackHandle{}) + + // Done + return netConn, nil +} + +// returnConn returns a connection back to the pool. +func (n *NetworkTransport) returnConn(conn *netConn) { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + key := conn.target + conns, _ := n.connPool[key] + + if !n.IsShutdown() && len(conns) < n.maxPool { + n.connPool[key] = append(conns, conn) + } else { + conn.Release() + } +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (n *NetworkTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) { + // Get a connection + conn, err := n.getConn(target) + if err != nil { + return nil, err + } + + // Create the pipeline + return newNetPipeline(n, conn), nil +} + +// AppendEntries implements the Transport interface. +func (n *NetworkTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + return n.genericRPC(target, rpcAppendEntries, args, resp) +} + +// RequestVote implements the Transport interface. +func (n *NetworkTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error { + return n.genericRPC(target, rpcRequestVote, args, resp) +} + +// genericRPC handles a simple request/response RPC. +func (n *NetworkTransport) genericRPC(target string, rpcType uint8, args interface{}, resp interface{}) error { + // Get a conn + conn, err := n.getConn(target) + if err != nil { + return err + } + + // Set a deadline + if n.timeout > 0 { + conn.conn.SetDeadline(time.Now().Add(n.timeout)) + } + + // Send the RPC + if err := sendRPC(conn, rpcType, args); err != nil { + return err + } + + // Decode the response + canReturn, err := decodeResponse(conn, resp) + if canReturn { + n.returnConn(conn) + } + return err +} + +// InstallSnapshot implements the Transport interface. +func (n *NetworkTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + // Get a conn, always close for InstallSnapshot + conn, err := n.getConn(target) + if err != nil { + return err + } + defer conn.Release() + + // Set a deadline, scaled by request size + if n.timeout > 0 { + timeout := n.timeout * time.Duration(args.Size/int64(n.TimeoutScale)) + if timeout < n.timeout { + timeout = n.timeout + } + conn.conn.SetDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err := sendRPC(conn, rpcInstallSnapshot, args); err != nil { + return err + } + + // Stream the state + if _, err := io.Copy(conn.w, data); err != nil { + return err + } + + // Flush + if err := conn.w.Flush(); err != nil { + return err + } + + // Decode the response, do not return conn + _, err = decodeResponse(conn, resp) + return err +} + +// EncodePeer implements the Transport interface. +func (n *NetworkTransport) EncodePeer(p string) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. +func (n *NetworkTransport) DecodePeer(buf []byte) string { + return string(buf) +} + +// listen is used to handling incoming connections. +func (n *NetworkTransport) listen() { + for { + // Accept incoming connections + conn, err := n.stream.Accept() + if err != nil { + if n.IsShutdown() { + return + } + n.logger.Printf("[ERR] raft-net: Failed to accept connection: %v", err) + continue + } + n.logger.Printf("[DEBUG] raft-net: %v accepted connection from: %v", n.LocalAddr(), conn.RemoteAddr()) + + // Handle the connection in dedicated routine + go n.handleConn(conn) + } +} + +// handleConn is used to handle an inbound connection for its lifespan. +func (n *NetworkTransport) handleConn(conn net.Conn) { + defer conn.Close() + r := bufio.NewReader(conn) + w := bufio.NewWriter(conn) + dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) + enc := codec.NewEncoder(w, &codec.MsgpackHandle{}) + + for { + if err := n.handleCommand(r, dec, enc); err != nil { + if err != io.EOF { + n.logger.Printf("[ERR] raft-net: Failed to decode incoming command: %v", err) + } + return + } + if err := w.Flush(); err != nil { + n.logger.Printf("[ERR] raft-net: Failed to flush response: %v", err) + return + } + } +} + +// handleCommand is used to decode and dispatch a single command. +func (n *NetworkTransport) handleCommand(r *bufio.Reader, dec *codec.Decoder, enc *codec.Encoder) error { + // Get the rpc type + rpcType, err := r.ReadByte() + if err != nil { + return err + } + + // Create the RPC object + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + RespChan: respCh, + } + + // Decode the command + isHeartbeat := false + switch rpcType { + case rpcAppendEntries: + var req AppendEntriesRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + // Check if this is a heartbeat + if req.Term != 0 && req.Leader != nil && + req.PrevLogEntry == 0 && req.PrevLogTerm == 0 && + len(req.Entries) == 0 && req.LeaderCommitIndex == 0 { + isHeartbeat = true + } + + case rpcRequestVote: + var req RequestVoteRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + case rpcInstallSnapshot: + var req InstallSnapshotRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + rpc.Reader = io.LimitReader(r, req.Size) + + default: + return fmt.Errorf("unknown rpc type %d", rpcType) + } + + // Check for heartbeat fast-path + if isHeartbeat { + n.heartbeatFnLock.Lock() + fn := n.heartbeatFn + n.heartbeatFnLock.Unlock() + if fn != nil { + fn(rpc) + goto RESP + } + } + + // Dispatch the RPC + select { + case n.consumeCh <- rpc: + case <-n.shutdownCh: + return ErrTransportShutdown + } + + // Wait for response +RESP: + select { + case resp := <-respCh: + // Send the error first + respErr := "" + if resp.Error != nil { + respErr = resp.Error.Error() + } + if err := enc.Encode(respErr); err != nil { + return err + } + + // Send the response + if err := enc.Encode(resp.Response); err != nil { + return err + } + case <-n.shutdownCh: + return ErrTransportShutdown + } + return nil +} + +// decodeResponse is used to decode an RPC response and reports whether +// the connection can be reused. +func decodeResponse(conn *netConn, resp interface{}) (bool, error) { + // Decode the error if any + var rpcError string + if err := conn.dec.Decode(&rpcError); err != nil { + conn.Release() + return false, err + } + + // Decode the response + if err := conn.dec.Decode(resp); err != nil { + conn.Release() + return false, err + } + + // Format an error if any + if rpcError != "" { + return true, fmt.Errorf(rpcError) + } + return true, nil +} + +// sendRPC is used to encode and send the RPC. +func sendRPC(conn *netConn, rpcType uint8, args interface{}) error { + // Write the request type + if err := conn.w.WriteByte(rpcType); err != nil { + conn.Release() + return err + } + + // Send the request + if err := conn.enc.Encode(args); err != nil { + conn.Release() + return err + } + + // Flush + if err := conn.w.Flush(); err != nil { + conn.Release() + return err + } + return nil +} + +// newNetPipeline is used to construct a netPipeline from a given +// transport and connection. +func newNetPipeline(trans *NetworkTransport, conn *netConn) *netPipeline { + n := &netPipeline{ + conn: conn, + trans: trans, + doneCh: make(chan AppendFuture, rpcMaxPipeline), + inprogressCh: make(chan *appendFuture, rpcMaxPipeline), + shutdownCh: make(chan struct{}), + } + go n.decodeResponses() + return n +} + +// decodeResponses is a long running routine that decodes the responses +// sent on the connection. +func (n *netPipeline) decodeResponses() { + timeout := n.trans.timeout + for { + select { + case future := <-n.inprogressCh: + if timeout > 0 { + n.conn.conn.SetReadDeadline(time.Now().Add(timeout)) + } + + _, err := decodeResponse(n.conn, future.resp) + future.respond(err) + select { + case n.doneCh <- future: + case <-n.shutdownCh: + return + } + case <-n.shutdownCh: + return + } + } +} + +// AppendEntries is used to pipeline a new append entries request. +func (n *netPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Add a send timeout + if timeout := n.trans.timeout; timeout > 0 { + n.conn.conn.SetWriteDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err := sendRPC(n.conn, rpcAppendEntries, future.args); err != nil { + return nil, err + } + + // Hand-off for decoding, this can also cause back-pressure + // to prevent too many inflight requests + select { + case n.inprogressCh <- future: + return future, nil + case <-n.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +// Consumer returns a channel that can be used to consume complete futures. +func (n *netPipeline) Consumer() <-chan AppendFuture { + return n.doneCh +} + +// Closed is used to shutdown the pipeline connection. +func (n *netPipeline) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + if n.shutdown { + return nil + } + + // Release the connection + n.conn.Release() + + n.shutdown = true + close(n.shutdownCh) + return nil +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/peer.go b/Godeps/_workspace/src/github.com/hashicorp/raft/peer.go new file mode 100644 index 00000000000..6f3bcf85645 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/peer.go @@ -0,0 +1,122 @@ +package raft + +import ( + "bytes" + "encoding/json" + "io/ioutil" + "os" + "path/filepath" + "sync" +) + +const ( + jsonPeerPath = "peers.json" +) + +// PeerStore provides an interface for persistent storage and +// retrieval of peers. We use a separate interface than StableStore +// since the peers may need to be edited by a human operator. For example, +// in a two node cluster, the failure of either node requires human intervention +// since consensus is impossible. +type PeerStore interface { + // Peers returns the list of known peers. + Peers() ([]string, error) + + // SetPeers sets the list of known peers. This is invoked when a peer is + // added or removed. + SetPeers([]string) error +} + +// StaticPeers is used to provide a static list of peers. +type StaticPeers struct { + StaticPeers []string + l sync.Mutex +} + +// Peers implements the PeerStore interface. +func (s *StaticPeers) Peers() ([]string, error) { + s.l.Lock() + peers := s.StaticPeers + s.l.Unlock() + return peers, nil +} + +// SetPeers implements the PeerStore interface. +func (s *StaticPeers) SetPeers(p []string) error { + s.l.Lock() + s.StaticPeers = p + s.l.Unlock() + return nil +} + +// JSONPeers is used to provide peer persistence on disk in the form +// of a JSON file. This allows human operators to manipulate the file. +type JSONPeers struct { + l sync.Mutex + path string + trans Transport +} + +// NewJSONPeers creates a new JSONPeers store. Requires a transport +// to handle the serialization of network addresses. +func NewJSONPeers(base string, trans Transport) *JSONPeers { + path := filepath.Join(base, jsonPeerPath) + store := &JSONPeers{ + path: path, + trans: trans, + } + return store +} + +// Peers implements the PeerStore interface. +func (j *JSONPeers) Peers() ([]string, error) { + j.l.Lock() + defer j.l.Unlock() + + // Read the file + buf, err := ioutil.ReadFile(j.path) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + + // Check for no peers + if len(buf) == 0 { + return nil, nil + } + + // Decode the peers + var peerSet []string + dec := json.NewDecoder(bytes.NewReader(buf)) + if err := dec.Decode(&peerSet); err != nil { + return nil, err + } + + // Deserialize each peer + var peers []string + for _, p := range peerSet { + peers = append(peers, j.trans.DecodePeer([]byte(p))) + } + return peers, nil +} + +// SetPeers implements the PeerStore interface. +func (j *JSONPeers) SetPeers(peers []string) error { + j.l.Lock() + defer j.l.Unlock() + + // Encode each peer + var peerSet []string + for _, p := range peers { + peerSet = append(peerSet, string(j.trans.EncodePeer(p))) + } + + // Convert to JSON + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + if err := enc.Encode(peerSet); err != nil { + return err + } + + // Write out as JSON + return ioutil.WriteFile(j.path, buf.Bytes(), 0755) +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/raft.go b/Godeps/_workspace/src/github.com/hashicorp/raft/raft.go new file mode 100644 index 00000000000..f7880ba9c97 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/raft.go @@ -0,0 +1,1887 @@ +package raft + +import ( + "bytes" + "errors" + "fmt" + "io" + "log" + "os" + "strconv" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +const ( + minCheckInterval = 10 * time.Millisecond +) + +var ( + keyCurrentTerm = []byte("CurrentTerm") + keyLastVoteTerm = []byte("LastVoteTerm") + keyLastVoteCand = []byte("LastVoteCand") + + // ErrLeader is returned when an operation can't be completed on a + // leader node. + ErrLeader = errors.New("node is the leader") + + // ErrNotLeader is returned when an operation can't be completed on a + // follower or candidate node. + ErrNotLeader = errors.New("node is not the leader") + + // ErrLeadershipLost is returned when a leader fails to commit a log entry + // because it's been deposed in the process. + ErrLeadershipLost = errors.New("leadership lost while committing log") + + // ErrRaftShutdown is returned when operations are requested against an + // inactive Raft. + ErrRaftShutdown = errors.New("raft is already shutdown") + + // ErrEnqueueTimeout is returned when a command fails due to a timeout. + ErrEnqueueTimeout = errors.New("timed out enqueuing operation") + + // ErrKnownPeer is returned when trying to add a peer to the configuration + // that already exists. + ErrKnownPeer = errors.New("peer already known") + + // ErrUnknownPeer is returned when trying to remove a peer from the + // configuration that doesn't exist. + ErrUnknownPeer = errors.New("peer is unknown") + + // ErrNothingNewToSnapshot is returned when trying to create a snapshot + // but there's nothing new commited to the FSM since we started. + ErrNothingNewToSnapshot = errors.New("Nothing new to snapshot") +) + +// commitTuple is used to send an index that was committed, +// with an optional associated future that should be invoked. +type commitTuple struct { + log *Log + future *logFuture +} + +// leaderState is state that is used while we are a leader. +type leaderState struct { + commitCh chan struct{} + inflight *inflight + replState map[string]*followerReplication + notify map[*verifyFuture]struct{} + stepDown chan struct{} +} + +// Raft implements a Raft node. +type Raft struct { + raftState + + // applyCh is used to async send logs to the main thread to + // be committed and applied to the FSM. + applyCh chan *logFuture + + // Configuration provided at Raft initialization + conf *Config + + // FSM is the client state machine to apply commands to + fsm FSM + + // fsmCommitCh is used to trigger async application of logs to the fsm + fsmCommitCh chan commitTuple + + // fsmRestoreCh is used to trigger a restore from snapshot + fsmRestoreCh chan *restoreFuture + + // fsmSnapshotCh is used to trigger a new snapshot being taken + fsmSnapshotCh chan *reqSnapshotFuture + + // lastContact is the last time we had contact from the + // leader node. This can be used to gauge staleness. + lastContact time.Time + lastContactLock sync.RWMutex + + // Leader is the current cluster leader + leader string + leaderLock sync.RWMutex + + // leaderCh is used to notify of leadership changes + leaderCh chan bool + + // leaderState used only while state is leader + leaderState leaderState + + // Stores our local addr + localAddr string + + // Used for our logging + logger *log.Logger + + // LogStore provides durable storage for logs + logs LogStore + + // Track our known peers + peerCh chan *peerFuture + peers []string + peerStore PeerStore + + // RPC chan comes from the transport layer + rpcCh <-chan RPC + + // Shutdown channel to exit, protected to prevent concurrent exits + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + // snapshots is used to store and retrieve snapshots + snapshots SnapshotStore + + // snapshotCh is used for user triggered snapshots + snapshotCh chan *snapshotFuture + + // stable is a StableStore implementation for durable state + // It provides stable storage for many fields in raftState + stable StableStore + + // The transport layer we use + trans Transport + + // verifyCh is used to async send verify futures to the main thread + // to verify we are still the leader + verifyCh chan *verifyFuture +} + +// NewRaft is used to construct a new Raft node. It takes a configuration, as well +// as implementations of various interfaces that are required. If we have any old state, +// such as snapshots, logs, peers, etc, all those will be restored when creating the +// Raft node. +func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps SnapshotStore, + peerStore PeerStore, trans Transport) (*Raft, error) { + // Validate the configuration + if err := ValidateConfig(conf); err != nil { + return nil, err + } + + // Ensure we have a LogOutput + var logger *log.Logger + if conf.Logger != nil { + logger = conf.Logger + } else { + if conf.LogOutput == nil { + conf.LogOutput = os.Stderr + } + logger = log.New(conf.LogOutput, "", log.LstdFlags) + } + + // Try to restore the current term + currentTerm, err := stable.GetUint64(keyCurrentTerm) + if err != nil && err.Error() != "not found" { + return nil, fmt.Errorf("failed to load current term: %v", err) + } + + // Read the last log value + lastIdx, err := logs.LastIndex() + if err != nil { + return nil, fmt.Errorf("failed to find last log: %v", err) + } + + // Get the log + var lastLog Log + if lastIdx > 0 { + if err := logs.GetLog(lastIdx, &lastLog); err != nil { + return nil, fmt.Errorf("failed to get last log: %v", err) + } + } + + // Construct the list of peers that excludes us + localAddr := trans.LocalAddr() + peers, err := peerStore.Peers() + if err != nil { + return nil, fmt.Errorf("failed to get list of peers: %v", err) + } + peers = ExcludePeer(peers, localAddr) + + // Create Raft struct + r := &Raft{ + applyCh: make(chan *logFuture), + conf: conf, + fsm: fsm, + fsmCommitCh: make(chan commitTuple, 128), + fsmRestoreCh: make(chan *restoreFuture), + fsmSnapshotCh: make(chan *reqSnapshotFuture), + leaderCh: make(chan bool), + localAddr: localAddr, + logger: logger, + logs: logs, + peerCh: make(chan *peerFuture), + peers: peers, + peerStore: peerStore, + rpcCh: trans.Consumer(), + snapshots: snaps, + snapshotCh: make(chan *snapshotFuture), + shutdownCh: make(chan struct{}), + stable: stable, + trans: trans, + verifyCh: make(chan *verifyFuture, 64), + } + + // Initialize as a follower + r.setState(Follower) + + // Start as leader if specified. This should only be used + // for testing purposes. + if conf.StartAsLeader { + r.setState(Leader) + r.setLeader(r.localAddr) + } + + // Restore the current term and the last log + r.setCurrentTerm(currentTerm) + r.setLastLogIndex(lastLog.Index) + r.setLastLogTerm(lastLog.Term) + + // Attempt to restore a snapshot if there are any + if err := r.restoreSnapshot(); err != nil { + return nil, err + } + + // Setup a heartbeat fast-path to avoid head-of-line + // blocking where possible. It MUST be safe for this + // to be called concurrently with a blocking RPC. + trans.SetHeartbeatHandler(r.processHeartbeat) + + // Start the background work + r.goFunc(r.run) + r.goFunc(r.runFSM) + r.goFunc(r.runSnapshots) + return r, nil +} + +// Leader is used to return the current leader of the cluster. +// It may return empty string if there is no current leader +// or the leader is unknown. +func (r *Raft) Leader() string { + r.leaderLock.RLock() + leader := r.leader + r.leaderLock.RUnlock() + return leader +} + +// setLeader is used to modify the current leader of the cluster +func (r *Raft) setLeader(leader string) { + r.leaderLock.Lock() + r.leader = leader + r.leaderLock.Unlock() +} + +// Apply is used to apply a command to the FSM in a highly consistent +// manner. This returns a future that can be used to wait on the application. +// An optional timeout can be provided to limit the amount of time we wait +// for the command to be started. This must be run on the leader or it +// will fail. +func (r *Raft) Apply(cmd []byte, timeout time.Duration) ApplyFuture { + metrics.IncrCounter([]string{"raft", "apply"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogCommand, + Data: cmd, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// Barrier is used to issue a command that blocks until all preceeding +// operations have been applied to the FSM. It can be used to ensure the +// FSM reflects all queued writes. An optional timeout can be provided to +// limit the amount of time we wait for the command to be started. This +// must be run on the leader or it will fail. +func (r *Raft) Barrier(timeout time.Duration) Future { + metrics.IncrCounter([]string{"raft", "barrier"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogBarrier, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// VerifyLeader is used to ensure the current node is still +// the leader. This can be done to prevent stale reads when a +// new leader has potentially been elected. +func (r *Raft) VerifyLeader() Future { + metrics.IncrCounter([]string{"raft", "verify_leader"}, 1) + verifyFuture := &verifyFuture{} + verifyFuture.init() + select { + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.verifyCh <- verifyFuture: + return verifyFuture + } +} + +// AddPeer is used to add a new peer into the cluster. This must be +// run on the leader or it will fail. +func (r *Raft) AddPeer(peer string) Future { + logFuture := &logFuture{ + log: Log{ + Type: LogAddPeer, + peer: peer, + }, + } + logFuture.init() + select { + case r.applyCh <- logFuture: + return logFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// RemovePeer is used to remove a peer from the cluster. If the +// current leader is being removed, it will cause a new election +// to occur. This must be run on the leader or it will fail. +func (r *Raft) RemovePeer(peer string) Future { + logFuture := &logFuture{ + log: Log{ + Type: LogRemovePeer, + peer: peer, + }, + } + logFuture.init() + select { + case r.applyCh <- logFuture: + return logFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// SetPeers is used to forcibly replace the set of internal peers and +// the peerstore with the ones specified. This can be considered unsafe. +func (r *Raft) SetPeers(p []string) Future { + peerFuture := &peerFuture{ + peers: p, + } + peerFuture.init() + + select { + case r.peerCh <- peerFuture: + return peerFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// Shutdown is used to stop the Raft background routines. +// This is not a graceful operation. Provides a future that +// can be used to block until all background routines have exited. +func (r *Raft) Shutdown() Future { + r.shutdownLock.Lock() + defer r.shutdownLock.Unlock() + + if !r.shutdown { + close(r.shutdownCh) + r.shutdown = true + r.setState(Shutdown) + } + + return &shutdownFuture{r} +} + +// Snapshot is used to manually force Raft to take a snapshot. +// Returns a future that can be used to block until complete. +func (r *Raft) Snapshot() Future { + snapFuture := &snapshotFuture{} + snapFuture.init() + select { + case r.snapshotCh <- snapFuture: + return snapFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } + +} + +// State is used to return the current raft state. +func (r *Raft) State() RaftState { + return r.getState() +} + +// LeaderCh is used to get a channel which delivers signals on +// acquiring or losing leadership. It sends true if we become +// the leader, and false if we lose it. The channel is not buffered, +// and does not block on writes. +func (r *Raft) LeaderCh() <-chan bool { + return r.leaderCh +} + +func (r *Raft) String() string { + return fmt.Sprintf("Node at %s [%v]", r.localAddr, r.getState()) +} + +// LastContact returns the time of last contact by a leader. +// This only makes sense if we are currently a follower. +func (r *Raft) LastContact() time.Time { + r.lastContactLock.RLock() + last := r.lastContact + r.lastContactLock.RUnlock() + return last +} + +// Stats is used to return a map of various internal stats. This should only +// be used for informative purposes or debugging. +func (r *Raft) Stats() map[string]string { + toString := func(v uint64) string { + return strconv.FormatUint(v, 10) + } + s := map[string]string{ + "state": r.getState().String(), + "term": toString(r.getCurrentTerm()), + "last_log_index": toString(r.getLastLogIndex()), + "last_log_term": toString(r.getLastLogTerm()), + "commit_index": toString(r.getCommitIndex()), + "applied_index": toString(r.getLastApplied()), + "fsm_pending": toString(uint64(len(r.fsmCommitCh))), + "last_snapshot_index": toString(r.getLastSnapshotIndex()), + "last_snapshot_term": toString(r.getLastSnapshotTerm()), + "num_peers": toString(uint64(len(r.peers))), + } + last := r.LastContact() + if last.IsZero() { + s["last_contact"] = "never" + } else if r.getState() == Leader { + s["last_contact"] = "0" + } else { + s["last_contact"] = fmt.Sprintf("%v", time.Now().Sub(last)) + } + return s +} + +// LastIndex returns the last index in stable storage, +// either from the last log or from the last snapshot. +func (r *Raft) LastIndex() uint64 { + return r.getLastIndex() +} + +// AppliedIndex returns the last index applied to the FSM. +// This is generally lagging behind the last index, especially +// for indexes that are persisted but have not yet been considered +// committed by the leader. +func (r *Raft) AppliedIndex() uint64 { + return r.getLastApplied() +} + +// runFSM is a long running goroutine responsible for applying logs +// to the FSM. This is done async of other logs since we don't want +// the FSM to block our internal operations. +func (r *Raft) runFSM() { + var lastIndex, lastTerm uint64 + for { + select { + case req := <-r.fsmRestoreCh: + // Open the snapshot + meta, source, err := r.snapshots.Open(req.ID) + if err != nil { + req.respond(fmt.Errorf("failed to open snapshot %v: %v", req.ID, err)) + continue + } + + // Attempt to restore + start := time.Now() + if err := r.fsm.Restore(source); err != nil { + req.respond(fmt.Errorf("failed to restore snapshot %v: %v", req.ID, err)) + source.Close() + continue + } + source.Close() + metrics.MeasureSince([]string{"raft", "fsm", "restore"}, start) + + // Update the last index and term + lastIndex = meta.Index + lastTerm = meta.Term + req.respond(nil) + + case req := <-r.fsmSnapshotCh: + // Is there something to snapshot? + if lastIndex == 0 { + req.respond(ErrNothingNewToSnapshot) + continue + } + + // Get our peers + peers, err := r.peerStore.Peers() + if err != nil { + req.respond(err) + continue + } + + // Start a snapshot + start := time.Now() + snap, err := r.fsm.Snapshot() + metrics.MeasureSince([]string{"raft", "fsm", "snapshot"}, start) + + // Respond to the request + req.index = lastIndex + req.term = lastTerm + req.peers = peers + req.snapshot = snap + req.respond(err) + + case commitTuple := <-r.fsmCommitCh: + // Apply the log if a command + var resp interface{} + if commitTuple.log.Type == LogCommand { + start := time.Now() + resp = r.fsm.Apply(commitTuple.log) + metrics.MeasureSince([]string{"raft", "fsm", "apply"}, start) + } + + // Update the indexes + lastIndex = commitTuple.log.Index + lastTerm = commitTuple.log.Term + + // Invoke the future if given + if commitTuple.future != nil { + commitTuple.future.response = resp + commitTuple.future.respond(nil) + } + case <-r.shutdownCh: + return + } + } +} + +// run is a long running goroutine that runs the Raft FSM. +func (r *Raft) run() { + for { + // Check if we are doing a shutdown + select { + case <-r.shutdownCh: + // Clear the leader to prevent forwarding + r.setLeader("") + return + default: + } + + // Enter into a sub-FSM + switch r.getState() { + case Follower: + r.runFollower() + case Candidate: + r.runCandidate() + case Leader: + r.runLeader() + } + } +} + +// runFollower runs the FSM for a follower. +func (r *Raft) runFollower() { + didWarn := false + r.logger.Printf("[INFO] raft: %v entering Follower state", r) + metrics.IncrCounter([]string{"raft", "state", "follower"}, 1) + heartbeatTimer := randomTimeout(r.conf.HeartbeatTimeout) + for { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case a := <-r.applyCh: + // Reject any operations since we are not the leader + a.respond(ErrNotLeader) + + case v := <-r.verifyCh: + // Reject any operations since we are not the leader + v.respond(ErrNotLeader) + + case p := <-r.peerCh: + // Set the peers + r.peers = ExcludePeer(p.peers, r.localAddr) + p.respond(r.peerStore.SetPeers(p.peers)) + + case <-heartbeatTimer: + // Restart the heartbeat timer + heartbeatTimer = randomTimeout(r.conf.HeartbeatTimeout) + + // Check if we have had a successful contact + lastContact := r.LastContact() + if time.Now().Sub(lastContact) < r.conf.HeartbeatTimeout { + continue + } + + // Heartbeat failed! Transition to the candidate state + r.setLeader("") + if len(r.peers) == 0 && !r.conf.EnableSingleNode { + if !didWarn { + r.logger.Printf("[WARN] raft: EnableSingleNode disabled, and no known peers. Aborting election.") + didWarn = true + } + } else { + r.logger.Printf("[WARN] raft: Heartbeat timeout reached, starting election") + + metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timout"}, 1) + r.setState(Candidate) + return + } + + case <-r.shutdownCh: + return + } + } +} + +// runCandidate runs the FSM for a candidate. +func (r *Raft) runCandidate() { + r.logger.Printf("[INFO] raft: %v entering Candidate state", r) + metrics.IncrCounter([]string{"raft", "state", "candidate"}, 1) + + // Start vote for us, and set a timeout + voteCh := r.electSelf() + electionTimer := randomTimeout(r.conf.ElectionTimeout) + + // Tally the votes, need a simple majority + grantedVotes := 0 + votesNeeded := r.quorumSize() + r.logger.Printf("[DEBUG] raft: Votes needed: %d", votesNeeded) + + for r.getState() == Candidate { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case vote := <-voteCh: + // Check if the term is greater than ours, bail + if vote.Term > r.getCurrentTerm() { + r.logger.Printf("[DEBUG] raft: Newer term discovered, fallback to follower") + r.setState(Follower) + r.setCurrentTerm(vote.Term) + return + } + + // Check if the vote is granted + if vote.Granted { + grantedVotes++ + r.logger.Printf("[DEBUG] raft: Vote granted from %s. Tally: %d", vote.voter, grantedVotes) + } + + // Check if we've become the leader + if grantedVotes >= votesNeeded { + r.logger.Printf("[INFO] raft: Election won. Tally: %d", grantedVotes) + r.setState(Leader) + r.setLeader(r.localAddr) + return + } + + case a := <-r.applyCh: + // Reject any operations since we are not the leader + a.respond(ErrNotLeader) + + case v := <-r.verifyCh: + // Reject any operations since we are not the leader + v.respond(ErrNotLeader) + + case p := <-r.peerCh: + // Set the peers + r.peers = ExcludePeer(p.peers, r.localAddr) + p.respond(r.peerStore.SetPeers(p.peers)) + // Become a follower again + r.setState(Follower) + return + + case <-electionTimer: + // Election failed! Restart the election. We simply return, + // which will kick us back into runCandidate + r.logger.Printf("[WARN] raft: Election timeout reached, restarting election") + return + + case <-r.shutdownCh: + return + } + } +} + +// runLeader runs the FSM for a leader. Do the setup here and drop into +// the leaderLoop for the hot loop. +func (r *Raft) runLeader() { + r.logger.Printf("[INFO] raft: %v entering Leader state", r) + metrics.IncrCounter([]string{"raft", "state", "leader"}, 1) + + // Notify that we are the leader + asyncNotifyBool(r.leaderCh, true) + + // Push to the notify channel if given + if notify := r.conf.NotifyCh; notify != nil { + select { + case notify <- true: + case <-r.shutdownCh: + } + } + + // Setup leader state + r.leaderState.commitCh = make(chan struct{}, 1) + r.leaderState.inflight = newInflight(r.leaderState.commitCh) + r.leaderState.replState = make(map[string]*followerReplication) + r.leaderState.notify = make(map[*verifyFuture]struct{}) + r.leaderState.stepDown = make(chan struct{}, 1) + + // Cleanup state on step down + defer func() { + // Since we were the leader previously, we update our + // last contact time when we step down, so that we are not + // reporting a last contact time from before we were the + // leader. Otherwise, to a client it would seem our data + // is extremely stale. + r.setLastContact() + + // Stop replication + for _, p := range r.leaderState.replState { + close(p.stopCh) + } + + // Cancel inflight requests + r.leaderState.inflight.Cancel(ErrLeadershipLost) + + // Respond to any pending verify requests + for future := range r.leaderState.notify { + future.respond(ErrLeadershipLost) + } + + // Clear all the state + r.leaderState.commitCh = nil + r.leaderState.inflight = nil + r.leaderState.replState = nil + r.leaderState.notify = nil + r.leaderState.stepDown = nil + + // If we are stepping down for some reason, no known leader. + // We may have stepped down due to an RPC call, which would + // provide the leader, so we cannot always blank this out. + r.leaderLock.Lock() + if r.leader == r.localAddr { + r.leader = "" + } + r.leaderLock.Unlock() + + // Notify that we are not the leader + asyncNotifyBool(r.leaderCh, false) + + // Push to the notify channel if given + if notify := r.conf.NotifyCh; notify != nil { + select { + case notify <- false: + case <-r.shutdownCh: + // On shutdown, make a best effort but do not block + select { + case notify <- false: + default: + } + } + } + }() + + // Start a replication routine for each peer + for _, peer := range r.peers { + r.startReplication(peer) + } + + // Dispatch a no-op log first. Instead of LogNoop, + // we use a LogAddPeer with our peerset. This acts like + // a no-op as well, but when doing an initial bootstrap, ensures + // that all nodes share a common peerset. + peerSet := append([]string{r.localAddr}, r.peers...) + noop := &logFuture{ + log: Log{ + Type: LogAddPeer, + Data: encodePeers(peerSet, r.trans), + }, + } + r.dispatchLogs([]*logFuture{noop}) + + // Disable EnableSingleNode after we've been elected leader. + // This is to prevent a split brain in the future, if we are removed + // from the cluster and then elect ourself as leader. + if r.conf.DisableBootstrapAfterElect && r.conf.EnableSingleNode { + r.logger.Printf("[INFO] raft: Disabling EnableSingleNode (bootstrap)") + r.conf.EnableSingleNode = false + } + + // Sit in the leader loop until we step down + r.leaderLoop() +} + +// startReplication is a helper to setup state and start async replication to a peer. +func (r *Raft) startReplication(peer string) { + lastIdx := r.getLastIndex() + s := &followerReplication{ + peer: peer, + inflight: r.leaderState.inflight, + stopCh: make(chan uint64, 1), + triggerCh: make(chan struct{}, 1), + currentTerm: r.getCurrentTerm(), + matchIndex: 0, + nextIndex: lastIdx + 1, + lastContact: time.Now(), + notifyCh: make(chan struct{}, 1), + stepDown: r.leaderState.stepDown, + } + r.leaderState.replState[peer] = s + r.goFunc(func() { r.replicate(s) }) + asyncNotifyCh(s.triggerCh) +} + +// leaderLoop is the hot loop for a leader. It is invoked +// after all the various leader setup is done. +func (r *Raft) leaderLoop() { + // stepDown is used to track if there is an inflight log that + // would cause us to lose leadership (specifically a RemovePeer of + // ourselves). If this is the case, we must not allow any logs to + // be processed in parallel, otherwise we are basing commit on + // only a single peer (ourself) and replicating to an undefined set + // of peers. + stepDown := false + + lease := time.After(r.conf.LeaderLeaseTimeout) + for r.getState() == Leader { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case <-r.leaderState.stepDown: + r.setState(Follower) + + case <-r.leaderState.commitCh: + // Get the committed messages + committed := r.leaderState.inflight.Committed() + for e := committed.Front(); e != nil; e = e.Next() { + // Measure the commit time + commitLog := e.Value.(*logFuture) + metrics.MeasureSince([]string{"raft", "commitTime"}, commitLog.dispatch) + + // Increment the commit index + idx := commitLog.log.Index + r.setCommitIndex(idx) + r.processLogs(idx, commitLog) + } + + case v := <-r.verifyCh: + if v.quorumSize == 0 { + // Just dispatched, start the verification + r.verifyLeader(v) + + } else if v.votes < v.quorumSize { + // Early return, means there must be a new leader + r.logger.Printf("[WARN] raft: New leader elected, stepping down") + r.setState(Follower) + delete(r.leaderState.notify, v) + v.respond(ErrNotLeader) + + } else { + // Quorum of members agree, we are still leader + delete(r.leaderState.notify, v) + v.respond(nil) + } + + case p := <-r.peerCh: + p.respond(ErrLeader) + + case newLog := <-r.applyCh: + // Group commit, gather all the ready commits + ready := []*logFuture{newLog} + for i := 0; i < r.conf.MaxAppendEntries; i++ { + select { + case newLog := <-r.applyCh: + ready = append(ready, newLog) + default: + break + } + } + + // Handle any peer set changes + n := len(ready) + for i := 0; i < n; i++ { + // Fail all future transactions once stepDown is on + if stepDown { + ready[i].respond(ErrNotLeader) + ready[i], ready[n-1] = ready[n-1], nil + n-- + i-- + continue + } + + // Special case AddPeer and RemovePeer + log := ready[i] + if log.log.Type != LogAddPeer && log.log.Type != LogRemovePeer { + continue + } + + // Check if this log should be ignored. The logs can be + // reordered here since we have not yet assigned an index + // and are not violating any promises. + if !r.preparePeerChange(log) { + ready[i], ready[n-1] = ready[n-1], nil + n-- + i-- + continue + } + + // Apply peer set changes early and check if we will step + // down after the commit of this log. If so, we must not + // allow any future entries to make progress to avoid undefined + // behavior. + if ok := r.processLog(&log.log, nil, true); ok { + stepDown = true + } + } + + // Nothing to do if all logs are invalid + if n == 0 { + continue + } + + // Dispatch the logs + ready = ready[:n] + r.dispatchLogs(ready) + + case <-lease: + // Check if we've exceeded the lease, potentially stepping down + maxDiff := r.checkLeaderLease() + + // Next check interval should adjust for the last node we've + // contacted, without going negative + checkInterval := r.conf.LeaderLeaseTimeout - maxDiff + if checkInterval < minCheckInterval { + checkInterval = minCheckInterval + } + + // Renew the lease timer + lease = time.After(checkInterval) + + case <-r.shutdownCh: + return + } + } +} + +// verifyLeader must be called from the main thread for safety. +// Causes the followers to attempt an immediate heartbeat. +func (r *Raft) verifyLeader(v *verifyFuture) { + // Current leader always votes for self + v.votes = 1 + + // Set the quorum size, hot-path for single node + v.quorumSize = r.quorumSize() + if v.quorumSize == 1 { + v.respond(nil) + return + } + + // Track this request + v.notifyCh = r.verifyCh + r.leaderState.notify[v] = struct{}{} + + // Trigger immediate heartbeats + for _, repl := range r.leaderState.replState { + repl.notifyLock.Lock() + repl.notify = append(repl.notify, v) + repl.notifyLock.Unlock() + asyncNotifyCh(repl.notifyCh) + } +} + +// checkLeaderLease is used to check if we can contact a quorum of nodes +// within the last leader lease interval. If not, we need to step down, +// as we may have lost connectivity. Returns the maximum duration without +// contact. +func (r *Raft) checkLeaderLease() time.Duration { + // Track contacted nodes, we can always contact ourself + contacted := 1 + + // Check each follower + var maxDiff time.Duration + now := time.Now() + for peer, f := range r.leaderState.replState { + diff := now.Sub(f.LastContact()) + if diff <= r.conf.LeaderLeaseTimeout { + contacted++ + if diff > maxDiff { + maxDiff = diff + } + } else { + // Log at least once at high value, then debug. Otherwise it gets very verbose. + if diff <= 3*r.conf.LeaderLeaseTimeout { + r.logger.Printf("[WARN] raft: Failed to contact %v in %v", peer, diff) + } else { + r.logger.Printf("[DEBUG] raft: Failed to contact %v in %v", peer, diff) + } + } + metrics.AddSample([]string{"raft", "leader", "lastContact"}, float32(diff/time.Millisecond)) + } + + // Verify we can contact a quorum + quorum := r.quorumSize() + if contacted < quorum { + r.logger.Printf("[WARN] raft: Failed to contact quorum of nodes, stepping down") + r.setState(Follower) + metrics.IncrCounter([]string{"raft", "transition", "leader_lease_timeout"}, 1) + } + return maxDiff +} + +// quorumSize is used to return the quorum size +func (r *Raft) quorumSize() int { + return ((len(r.peers) + 1) / 2) + 1 +} + +// preparePeerChange checks if a LogAddPeer or LogRemovePeer should be performed, +// and properly formats the data field on the log before dispatching it. +func (r *Raft) preparePeerChange(l *logFuture) bool { + // Check if this is a known peer + p := l.log.peer + knownPeer := PeerContained(r.peers, p) || r.localAddr == p + + // Ignore known peers on add + if l.log.Type == LogAddPeer && knownPeer { + l.respond(ErrKnownPeer) + return false + } + + // Ignore unknown peers on remove + if l.log.Type == LogRemovePeer && !knownPeer { + l.respond(ErrUnknownPeer) + return false + } + + // Construct the peer set + var peerSet []string + if l.log.Type == LogAddPeer { + peerSet = append([]string{p, r.localAddr}, r.peers...) + } else { + peerSet = ExcludePeer(append([]string{r.localAddr}, r.peers...), p) + } + + // Setup the log + l.log.Data = encodePeers(peerSet, r.trans) + return true +} + +// dispatchLog is called to push a log to disk, mark it +// as inflight and begin replication of it. +func (r *Raft) dispatchLogs(applyLogs []*logFuture) { + now := time.Now() + defer metrics.MeasureSince([]string{"raft", "leader", "dispatchLog"}, now) + + term := r.getCurrentTerm() + lastIndex := r.getLastIndex() + logs := make([]*Log, len(applyLogs)) + + for idx, applyLog := range applyLogs { + applyLog.dispatch = now + applyLog.log.Index = lastIndex + uint64(idx) + 1 + applyLog.log.Term = term + applyLog.policy = newMajorityQuorum(len(r.peers) + 1) + logs[idx] = &applyLog.log + } + + // Write the log entry locally + if err := r.logs.StoreLogs(logs); err != nil { + r.logger.Printf("[ERR] raft: Failed to commit logs: %v", err) + for _, applyLog := range applyLogs { + applyLog.respond(err) + } + r.setState(Follower) + return + } + + // Add this to the inflight logs, commit + r.leaderState.inflight.StartAll(applyLogs) + + // Update the last log since it's on disk now + r.setLastLogIndex(lastIndex + uint64(len(applyLogs))) + r.setLastLogTerm(term) + + // Notify the replicators of the new log + for _, f := range r.leaderState.replState { + asyncNotifyCh(f.triggerCh) + } +} + +// processLogs is used to process all the logs from the lastApplied +// up to the given index. +func (r *Raft) processLogs(index uint64, future *logFuture) { + // Reject logs we've applied already + lastApplied := r.getLastApplied() + if index <= lastApplied { + r.logger.Printf("[WARN] raft: Skipping application of old log: %d", index) + return + } + + // Apply all the preceding logs + for idx := r.getLastApplied() + 1; idx <= index; idx++ { + // Get the log, either from the future or from our log store + if future != nil && future.log.Index == idx { + r.processLog(&future.log, future, false) + + } else { + l := new(Log) + if err := r.logs.GetLog(idx, l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at %d: %v", idx, err) + panic(err) + } + r.processLog(l, nil, false) + } + + // Update the lastApplied index and term + r.setLastApplied(idx) + } +} + +// processLog is invoked to process the application of a single committed log. +// Returns if this log entry would cause us to stepDown after it commits. +func (r *Raft) processLog(l *Log, future *logFuture, precommit bool) (stepDown bool) { + switch l.Type { + case LogBarrier: + // Barrier is handled by the FSM + fallthrough + + case LogCommand: + // Forward to the fsm handler + select { + case r.fsmCommitCh <- commitTuple{l, future}: + case <-r.shutdownCh: + if future != nil { + future.respond(ErrRaftShutdown) + } + } + + // Return so that the future is only responded to + // by the FSM handler when the application is done + return + + case LogAddPeer: + fallthrough + case LogRemovePeer: + peers := decodePeers(l.Data, r.trans) + r.logger.Printf("[DEBUG] raft: Node %v updated peer set (%v): %v", r.localAddr, l.Type, peers) + + // If the peer set does not include us, remove all other peers + removeSelf := !PeerContained(peers, r.localAddr) && l.Type == LogRemovePeer + if removeSelf { + // Mark that this operation will cause us to step down as + // leader. This prevents the future logs from being Applied + // from this leader. + stepDown = true + + // We only modify the peers after the commit, otherwise we + // would be using a quorum size of 1 for the RemovePeer operation. + // This is used with the stepDown guard to prevent any other logs. + if !precommit { + r.peers = nil + r.peerStore.SetPeers([]string{r.localAddr}) + } + } else { + r.peers = ExcludePeer(peers, r.localAddr) + r.peerStore.SetPeers(peers) + } + + // Handle replication if we are the leader + if r.getState() == Leader { + for _, p := range r.peers { + if _, ok := r.leaderState.replState[p]; !ok { + r.logger.Printf("[INFO] raft: Added peer %v, starting replication", p) + r.startReplication(p) + } + } + } + + // Stop replication for old nodes + if r.getState() == Leader && !precommit { + var toDelete []string + for _, repl := range r.leaderState.replState { + if !PeerContained(r.peers, repl.peer) { + r.logger.Printf("[INFO] raft: Removed peer %v, stopping replication (Index: %d)", repl.peer, l.Index) + + // Replicate up to this index and stop + repl.stopCh <- l.Index + close(repl.stopCh) + toDelete = append(toDelete, repl.peer) + } + } + for _, name := range toDelete { + delete(r.leaderState.replState, name) + } + } + + // Handle removing ourself + if removeSelf && !precommit { + if r.conf.ShutdownOnRemove { + r.logger.Printf("[INFO] raft: Removed ourself, shutting down") + r.Shutdown() + } else { + r.logger.Printf("[INFO] raft: Removed ourself, transitioning to follower") + r.setState(Follower) + } + } + + case LogNoop: + // Ignore the no-op + default: + r.logger.Printf("[ERR] raft: Got unrecognized log type: %#v", l) + } + + // Invoke the future if given + if future != nil && !precommit { + future.respond(nil) + } + return +} + +// processRPC is called to handle an incoming RPC request. +func (r *Raft) processRPC(rpc RPC) { + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + case *RequestVoteRequest: + r.requestVote(rpc, cmd) + case *InstallSnapshotRequest: + r.installSnapshot(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Got unexpected command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// processHeartbeat is a special handler used just for heartbeat requests +// so that they can be fast-pathed if a transport supports it. +func (r *Raft) processHeartbeat(rpc RPC) { + defer metrics.MeasureSince([]string{"raft", "rpc", "processHeartbeat"}, time.Now()) + + // Check if we are shutdown, just ignore the RPC + select { + case <-r.shutdownCh: + return + default: + } + + // Ensure we are only handling a heartbeat + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Expected heartbeat, got command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// appendEntries is invoked when we get an append entries RPC call. +func (r *Raft) appendEntries(rpc RPC, a *AppendEntriesRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "appendEntries"}, time.Now()) + // Setup a response + resp := &AppendEntriesResponse{ + Term: r.getCurrentTerm(), + LastLog: r.getLastIndex(), + Success: false, + NoRetryBackoff: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Ignore an older term + if a.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one, also transition to follower + // if we ever get an appendEntries call + if a.Term > r.getCurrentTerm() || r.getState() != Follower { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(a.Term) + resp.Term = a.Term + } + + // Save the current leader + r.setLeader(r.trans.DecodePeer(a.Leader)) + + // Verify the last log entry + if a.PrevLogEntry > 0 { + lastIdx, lastTerm := r.getLastEntry() + + var prevLogTerm uint64 + if a.PrevLogEntry == lastIdx { + prevLogTerm = lastTerm + + } else { + var prevLog Log + if err := r.logs.GetLog(a.PrevLogEntry, &prevLog); err != nil { + r.logger.Printf("[WARN] raft: Failed to get previous log: %d %v (last: %d)", + a.PrevLogEntry, err, lastIdx) + resp.NoRetryBackoff = true + return + } + prevLogTerm = prevLog.Term + } + + if a.PrevLogTerm != prevLogTerm { + r.logger.Printf("[WARN] raft: Previous log term mis-match: ours: %d remote: %d", + prevLogTerm, a.PrevLogTerm) + resp.NoRetryBackoff = true + return + } + } + + // Process any new entries + if n := len(a.Entries); n > 0 { + start := time.Now() + first := a.Entries[0] + last := a.Entries[n-1] + + // Delete any conflicting entries + lastLogIdx := r.getLastLogIndex() + if first.Index <= lastLogIdx { + r.logger.Printf("[WARN] raft: Clearing log suffix from %d to %d", first.Index, lastLogIdx) + if err := r.logs.DeleteRange(first.Index, lastLogIdx); err != nil { + r.logger.Printf("[ERR] raft: Failed to clear log suffix: %v", err) + return + } + } + + // Append the entry + if err := r.logs.StoreLogs(a.Entries); err != nil { + r.logger.Printf("[ERR] raft: Failed to append to logs: %v", err) + return + } + + // Update the lastLog + r.setLastLogIndex(last.Index) + r.setLastLogTerm(last.Term) + metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "storeLogs"}, start) + } + + // Update the commit index + if a.LeaderCommitIndex > 0 && a.LeaderCommitIndex > r.getCommitIndex() { + start := time.Now() + idx := min(a.LeaderCommitIndex, r.getLastIndex()) + r.setCommitIndex(idx) + r.processLogs(idx, nil) + metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "processLogs"}, start) + } + + // Everything went well, set success + resp.Success = true + r.setLastContact() + return +} + +// requestVote is invoked when we get an request vote RPC call. +func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "requestVote"}, time.Now()) + // Setup a response + resp := &RequestVoteResponse{ + Term: r.getCurrentTerm(), + Peers: encodePeers(r.peers, r.trans), + Granted: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Check if we have an existing leader [who's not the candidate] + candidate := r.trans.DecodePeer(req.Candidate) + if leader := r.Leader(); leader != "" && leader != candidate { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since we have a leader: %v", + candidate, leader) + return + } + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Check if we have voted yet + lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote term: %v", err) + return + } + lastVoteCandBytes, err := r.stable.Get(keyLastVoteCand) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote candidate: %v", err) + return + } + + // Check if we've voted in this election before + if lastVoteTerm == req.Term && lastVoteCandBytes != nil { + r.logger.Printf("[INFO] raft: Duplicate RequestVote for same term: %d", req.Term) + if bytes.Compare(lastVoteCandBytes, req.Candidate) == 0 { + r.logger.Printf("[WARN] raft: Duplicate RequestVote from candidate: %s", req.Candidate) + resp.Granted = true + } + return + } + + // Reject if their term is older + lastIdx, lastTerm := r.getLastEntry() + if lastTerm > req.LastLogTerm { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last term is greater (%d, %d)", + candidate, lastTerm, req.LastLogTerm) + return + } + + if lastIdx > req.LastLogIndex { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last index is greater (%d, %d)", + candidate, lastIdx, req.LastLogIndex) + return + } + + // Persist a vote for safety + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote: %v", err) + return + } + + resp.Granted = true + return +} + +// installSnapshot is invoked when we get a InstallSnapshot RPC call. +// We must be in the follower state for this, since it means we are +// too far behind a leader for log replay. +func (r *Raft) installSnapshot(rpc RPC, req *InstallSnapshotRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "installSnapshot"}, time.Now()) + // Setup a response + resp := &InstallSnapshotResponse{ + Term: r.getCurrentTerm(), + Success: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Save the current leader + r.setLeader(r.trans.DecodePeer(req.Leader)) + + // Create a new snapshot + sink, err := r.snapshots.Create(req.LastLogIndex, req.LastLogTerm, req.Peers) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to create snapshot to install: %v", err) + rpcErr = fmt.Errorf("failed to create snapshot: %v", err) + return + } + + // Spill the remote snapshot to disk + n, err := io.Copy(sink, rpc.Reader) + if err != nil { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to copy snapshot: %v", err) + rpcErr = err + return + } + + // Check that we received it all + if n != req.Size { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to receive whole snapshot: %d / %d", n, req.Size) + rpcErr = fmt.Errorf("short read") + return + } + + // Finalize the snapshot + if err := sink.Close(); err != nil { + r.logger.Printf("[ERR] raft: Failed to finalize snapshot: %v", err) + rpcErr = err + return + } + r.logger.Printf("[INFO] raft: Copied %d bytes to local snapshot", n) + + // Restore snapshot + future := &restoreFuture{ID: sink.ID()} + future.init() + select { + case r.fsmRestoreCh <- future: + case <-r.shutdownCh: + future.respond(ErrRaftShutdown) + return + } + + // Wait for the restore to happen + if err := future.Error(); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot: %v", err) + rpcErr = err + return + } + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(req.LastLogIndex) + + // Update the last stable snapshot info + r.setLastSnapshotIndex(req.LastLogIndex) + r.setLastSnapshotTerm(req.LastLogTerm) + + // Restore the peer set + peers := decodePeers(req.Peers, r.trans) + r.peers = ExcludePeer(peers, r.localAddr) + r.peerStore.SetPeers(peers) + + // Compact logs, continue even if this fails + if err := r.compactLogs(req.LastLogIndex); err != nil { + r.logger.Printf("[ERR] raft: Failed to compact logs: %v", err) + } + + r.logger.Printf("[INFO] raft: Installed remote snapshot") + resp.Success = true + r.setLastContact() + return +} + +// setLastContact is used to set the last contact time to now +func (r *Raft) setLastContact() { + r.lastContactLock.Lock() + r.lastContact = time.Now() + r.lastContactLock.Unlock() +} + +type voteResult struct { + RequestVoteResponse + voter string +} + +// electSelf is used to send a RequestVote RPC to all peers, +// and vote for ourself. This has the side affecting of incrementing +// the current term. The response channel returned is used to wait +// for all the responses (including a vote for ourself). +func (r *Raft) electSelf() <-chan *voteResult { + // Create a response channel + respCh := make(chan *voteResult, len(r.peers)+1) + + // Increment the term + r.setCurrentTerm(r.getCurrentTerm() + 1) + + // Construct the request + lastIdx, lastTerm := r.getLastEntry() + req := &RequestVoteRequest{ + Term: r.getCurrentTerm(), + Candidate: r.trans.EncodePeer(r.localAddr), + LastLogIndex: lastIdx, + LastLogTerm: lastTerm, + } + + // Construct a function to ask for a vote + askPeer := func(peer string) { + r.goFunc(func() { + defer metrics.MeasureSince([]string{"raft", "candidate", "electSelf"}, time.Now()) + resp := &voteResult{voter: peer} + err := r.trans.RequestVote(peer, req, &resp.RequestVoteResponse) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to make RequestVote RPC to %v: %v", peer, err) + resp.Term = req.Term + resp.Granted = false + } + + // If we are not a peer, we could have been removed but failed + // to receive the log message. OR it could mean an improperly configured + // cluster. Either way, we should warn + if err == nil { + peerSet := decodePeers(resp.Peers, r.trans) + if !PeerContained(peerSet, r.localAddr) { + r.logger.Printf("[WARN] raft: Remote peer %v does not have local node %v as a peer", + peer, r.localAddr) + } + } + + respCh <- resp + }) + } + + // For each peer, request a vote + for _, peer := range r.peers { + askPeer(peer) + } + + // Persist a vote for ourselves + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote : %v", err) + return nil + } + + // Include our own vote + respCh <- &voteResult{ + RequestVoteResponse: RequestVoteResponse{ + Term: req.Term, + Granted: true, + }, + voter: r.localAddr, + } + return respCh +} + +// persistVote is used to persist our vote for safety. +func (r *Raft) persistVote(term uint64, candidate []byte) error { + if err := r.stable.SetUint64(keyLastVoteTerm, term); err != nil { + return err + } + if err := r.stable.Set(keyLastVoteCand, candidate); err != nil { + return err + } + return nil +} + +// setCurrentTerm is used to set the current term in a durable manner. +func (r *Raft) setCurrentTerm(t uint64) { + // Persist to disk first + if err := r.stable.SetUint64(keyCurrentTerm, t); err != nil { + panic(fmt.Errorf("failed to save current term: %v", err)) + } + r.raftState.setCurrentTerm(t) +} + +// setState is used to update the current state. Any state +// transition causes the known leader to be cleared. This means +// that leader should be set only after updating the state. +func (r *Raft) setState(state RaftState) { + r.setLeader("") + r.raftState.setState(state) +} + +// runSnapshots is a long running goroutine used to manage taking +// new snapshots of the FSM. It runs in parallel to the FSM and +// main goroutines, so that snapshots do not block normal operation. +func (r *Raft) runSnapshots() { + for { + select { + case <-randomTimeout(r.conf.SnapshotInterval): + // Check if we should snapshot + if !r.shouldSnapshot() { + continue + } + + // Trigger a snapshot + if err := r.takeSnapshot(); err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + + case future := <-r.snapshotCh: + // User-triggered, run immediately + err := r.takeSnapshot() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + future.respond(err) + + case <-r.shutdownCh: + return + } + } +} + +// shouldSnapshot checks if we meet the conditions to take +// a new snapshot. +func (r *Raft) shouldSnapshot() bool { + // Check the last snapshot index + lastSnap := r.getLastSnapshotIndex() + + // Check the last log index + lastIdx, err := r.logs.LastIndex() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to get last log index: %v", err) + return false + } + + // Compare the delta to the threshold + delta := lastIdx - lastSnap + return delta >= r.conf.SnapshotThreshold +} + +// takeSnapshot is used to take a new snapshot. +func (r *Raft) takeSnapshot() error { + defer metrics.MeasureSince([]string{"raft", "snapshot", "takeSnapshot"}, time.Now()) + // Create a snapshot request + req := &reqSnapshotFuture{} + req.init() + + // Wait for dispatch or shutdown + select { + case r.fsmSnapshotCh <- req: + case <-r.shutdownCh: + return ErrRaftShutdown + } + + // Wait until we get a response + if err := req.Error(); err != nil { + if err != ErrNothingNewToSnapshot { + err = fmt.Errorf("failed to start snapshot: %v", err) + } + return err + } + defer req.snapshot.Release() + + // Log that we are starting the snapshot + r.logger.Printf("[INFO] raft: Starting snapshot up to %d", req.index) + + // Encode the peerset + peerSet := encodePeers(req.peers, r.trans) + + // Create a new snapshot + start := time.Now() + sink, err := r.snapshots.Create(req.index, req.term, peerSet) + if err != nil { + return fmt.Errorf("failed to create snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "create"}, start) + + // Try to persist the snapshot + start = time.Now() + if err := req.snapshot.Persist(sink); err != nil { + sink.Cancel() + return fmt.Errorf("failed to persist snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "persist"}, start) + + // Close and check for error + if err := sink.Close(); err != nil { + return fmt.Errorf("failed to close snapshot: %v", err) + } + + // Update the last stable snapshot info + r.setLastSnapshotIndex(req.index) + r.setLastSnapshotTerm(req.term) + + // Compact the logs + if err := r.compactLogs(req.index); err != nil { + return err + } + + // Log completion + r.logger.Printf("[INFO] raft: Snapshot to %d complete", req.index) + return nil +} + +// compactLogs takes the last inclusive index of a snapshot +// and trims the logs that are no longer needed. +func (r *Raft) compactLogs(snapIdx uint64) error { + defer metrics.MeasureSince([]string{"raft", "compactLogs"}, time.Now()) + // Determine log ranges to compact + minLog, err := r.logs.FirstIndex() + if err != nil { + return fmt.Errorf("failed to get first log index: %v", err) + } + + // Check if we have enough logs to truncate + if r.getLastLogIndex() <= r.conf.TrailingLogs { + return nil + } + + // Truncate up to the end of the snapshot, or `TrailingLogs` + // back from the head, which ever is further back. This ensures + // at least `TrailingLogs` entries, but does not allow logs + // after the snapshot to be removed. + maxLog := min(snapIdx, r.getLastLogIndex()-r.conf.TrailingLogs) + + // Log this + r.logger.Printf("[INFO] raft: Compacting logs from %d to %d", minLog, maxLog) + + // Compact the logs + if err := r.logs.DeleteRange(minLog, maxLog); err != nil { + return fmt.Errorf("log compaction failed: %v", err) + } + return nil +} + +// restoreSnapshot attempts to restore the latest snapshots, and fails +// if none of them can be restored. This is called at initialization time, +// and is completely unsafe to call at any other time. +func (r *Raft) restoreSnapshot() error { + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return err + } + + // Try to load in order of newest to oldest + for _, snapshot := range snapshots { + _, source, err := r.snapshots.Open(snapshot.ID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapshot.ID, err) + continue + } + defer source.Close() + + if err := r.fsm.Restore(source); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot %v: %v", snapshot.ID, err) + continue + } + + // Log success + r.logger.Printf("[INFO] raft: Restored from snapshot %v", snapshot.ID) + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(snapshot.Index) + + // Update the last stable snapshot info + r.setLastSnapshotIndex(snapshot.Index) + r.setLastSnapshotTerm(snapshot.Term) + + // Success! + return nil + } + + // If we had snapshots and failed to load them, its an error + if len(snapshots) > 0 { + return fmt.Errorf("failed to load any existing snapshots") + } + return nil +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/replication.go b/Godeps/_workspace/src/github.com/hashicorp/raft/replication.go new file mode 100644 index 00000000000..6a01631d237 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/replication.go @@ -0,0 +1,517 @@ +package raft + +import ( + "errors" + "fmt" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +const ( + maxFailureScale = 12 + failureWait = 10 * time.Millisecond +) + +var ( + // ErrLogNotFound indicates a given log entry is not available. + ErrLogNotFound = errors.New("log not found") + + // ErrPipelineReplicationNotSupported can be returned by the transport to + // signal that pipeline replication is not supported in general, and that + // no error message should be produced. + ErrPipelineReplicationNotSupported = errors.New("pipeline replication not supported") +) + +type followerReplication struct { + peer string + inflight *inflight + + stopCh chan uint64 + triggerCh chan struct{} + + currentTerm uint64 + matchIndex uint64 + nextIndex uint64 + + lastContact time.Time + lastContactLock sync.RWMutex + + failures uint64 + + notifyCh chan struct{} + notify []*verifyFuture + notifyLock sync.Mutex + + // stepDown is used to indicate to the leader that we + // should step down based on information from a follower. + stepDown chan struct{} + + // allowPipeline is used to control it seems like + // pipeline replication should be enabled. + allowPipeline bool +} + +// notifyAll is used to notify all the waiting verify futures +// if the follower believes we are still the leader. +func (s *followerReplication) notifyAll(leader bool) { + // Clear the waiting notifies minimizing lock time + s.notifyLock.Lock() + n := s.notify + s.notify = nil + s.notifyLock.Unlock() + + // Submit our votes + for _, v := range n { + v.vote(leader) + } +} + +// LastContact returns the time of last contact. +func (s *followerReplication) LastContact() time.Time { + s.lastContactLock.RLock() + last := s.lastContact + s.lastContactLock.RUnlock() + return last +} + +// setLastContact sets the last contact to the current time. +func (s *followerReplication) setLastContact() { + s.lastContactLock.Lock() + s.lastContact = time.Now() + s.lastContactLock.Unlock() +} + +// replicate is a long running routine that is used to manage +// the process of replicating logs to our followers. +func (r *Raft) replicate(s *followerReplication) { + // Start an async heartbeating routing + stopHeartbeat := make(chan struct{}) + defer close(stopHeartbeat) + r.goFunc(func() { r.heartbeat(s, stopHeartbeat) }) + +RPC: + shouldStop := false + for !shouldStop { + select { + case maxIndex := <-s.stopCh: + // Make a best effort to replicate up to this index + if maxIndex > 0 { + r.replicateTo(s, maxIndex) + } + return + case <-s.triggerCh: + shouldStop = r.replicateTo(s, r.getLastLogIndex()) + case <-randomTimeout(r.conf.CommitTimeout): + shouldStop = r.replicateTo(s, r.getLastLogIndex()) + } + + // If things looks healthy, switch to pipeline mode + if !shouldStop && s.allowPipeline { + goto PIPELINE + } + } + return + +PIPELINE: + // Disable until re-enabled + s.allowPipeline = false + + // Replicates using a pipeline for high performance. This method + // is not able to gracefully recover from errors, and so we fall back + // to standard mode on failure. + if err := r.pipelineReplicate(s); err != nil { + if err != ErrPipelineReplicationNotSupported { + r.logger.Printf("[ERR] raft: Failed to start pipeline replication to %s: %s", s.peer, err) + } + } + goto RPC +} + +// replicateTo is used to replicate the logs up to a given last index. +// If the follower log is behind, we take care to bring them up to date. +func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) { + // Create the base request + var req AppendEntriesRequest + var resp AppendEntriesResponse + var start time.Time +START: + // Prevent an excessive retry rate on errors + if s.failures > 0 { + select { + case <-time.After(backoff(failureWait, s.failures, maxFailureScale)): + case <-r.shutdownCh: + } + } + + // Setup the request + if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound { + goto SEND_SNAP + } else if err != nil { + return + } + + // Make the RPC call + start = time.Now() + if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to AppendEntries to %v: %v", s.peer, err) + s.failures++ + return + } + appendStats(s.peer, start, float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true + } + + // Update the last contact + s.setLastContact() + + // Update s based on success + if resp.Success { + // Update our replication state + updateLastAppended(s, &req) + + // Clear any failures, allow pipelining + s.failures = 0 + s.allowPipeline = true + } else { + s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1) + s.matchIndex = s.nextIndex - 1 + if resp.NoRetryBackoff { + s.failures = 0 + } else { + s.failures++ + } + r.logger.Printf("[WARN] raft: AppendEntries to %v rejected, sending older logs (next: %d)", s.peer, s.nextIndex) + } + +CHECK_MORE: + // Check if there are more logs to replicate + if s.nextIndex <= lastIndex { + goto START + } + return + + // SEND_SNAP is used when we fail to get a log, usually because the follower + // is too far behind, and we must ship a snapshot down instead +SEND_SNAP: + if stop, err := r.sendLatestSnapshot(s); stop { + return true + } else if err != nil { + r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err) + return + } + + // Check if there is more to replicate + goto CHECK_MORE +} + +// sendLatestSnapshot is used to send the latest snapshot we have +// down to our follower. +func (r *Raft) sendLatestSnapshot(s *followerReplication) (bool, error) { + // Get the snapshots + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return false, err + } + + // Check we have at least a single snapshot + if len(snapshots) == 0 { + return false, fmt.Errorf("no snapshots found") + } + + // Open the most recent snapshot + snapID := snapshots[0].ID + meta, snapshot, err := r.snapshots.Open(snapID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapID, err) + return false, err + } + defer snapshot.Close() + + // Setup the request + req := InstallSnapshotRequest{ + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + LastLogIndex: meta.Index, + LastLogTerm: meta.Term, + Peers: meta.Peers, + Size: meta.Size, + } + + // Make the call + start := time.Now() + var resp InstallSnapshotResponse + if err := r.trans.InstallSnapshot(s.peer, &req, &resp, snapshot); err != nil { + r.logger.Printf("[ERR] raft: Failed to install snapshot %v: %v", snapID, err) + s.failures++ + return false, err + } + metrics.MeasureSince([]string{"raft", "replication", "installSnapshot", s.peer}, start) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true, nil + } + + // Update the last contact + s.setLastContact() + + // Check for success + if resp.Success { + // Mark any inflight logs as committed + s.inflight.CommitRange(s.matchIndex+1, meta.Index) + + // Update the indexes + s.matchIndex = meta.Index + s.nextIndex = s.matchIndex + 1 + + // Clear any failures + s.failures = 0 + + // Notify we are still leader + s.notifyAll(true) + } else { + s.failures++ + r.logger.Printf("[WARN] raft: InstallSnapshot to %v rejected", s.peer) + } + return false, nil +} + +// heartbeat is used to periodically invoke AppendEntries on a peer +// to ensure they don't time out. This is done async of replicate(), +// since that routine could potentially be blocked on disk IO. +func (r *Raft) heartbeat(s *followerReplication, stopCh chan struct{}) { + var failures uint64 + req := AppendEntriesRequest{ + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + } + var resp AppendEntriesResponse + for { + // Wait for the next heartbeat interval or forced notify + select { + case <-s.notifyCh: + case <-randomTimeout(r.conf.HeartbeatTimeout / 10): + case <-stopCh: + return + } + + start := time.Now() + if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to heartbeat to %v: %v", s.peer, err) + failures++ + select { + case <-time.After(backoff(failureWait, failures, maxFailureScale)): + case <-stopCh: + } + } else { + s.setLastContact() + failures = 0 + metrics.MeasureSince([]string{"raft", "replication", "heartbeat", s.peer}, start) + s.notifyAll(resp.Success) + } + } +} + +// pipelineReplicate is used when we have synchronized our state with the follower, +// and want to switch to a higher performance pipeline mode of replication. +// We only pipeline AppendEntries commands, and if we ever hit an error, we fall +// back to the standard replication which can handle more complex situations. +func (r *Raft) pipelineReplicate(s *followerReplication) error { + // Create a new pipeline + pipeline, err := r.trans.AppendEntriesPipeline(s.peer) + if err != nil { + return err + } + defer pipeline.Close() + + // Log start and stop of pipeline + r.logger.Printf("[INFO] raft: pipelining replication to peer %v", s.peer) + defer r.logger.Printf("[INFO] raft: aborting pipeline replication to peer %v", s.peer) + + // Create a shutdown and finish channel + stopCh := make(chan struct{}) + finishCh := make(chan struct{}) + + // Start a dedicated decoder + r.goFunc(func() { r.pipelineDecode(s, pipeline, stopCh, finishCh) }) + + // Start pipeline sends at the last good nextIndex + nextIndex := s.nextIndex + + shouldStop := false +SEND: + for !shouldStop { + select { + case <-finishCh: + break SEND + case maxIndex := <-s.stopCh: + if maxIndex > 0 { + r.pipelineSend(s, pipeline, &nextIndex, maxIndex) + } + break SEND + case <-s.triggerCh: + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, r.getLastLogIndex()) + case <-randomTimeout(r.conf.CommitTimeout): + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, r.getLastLogIndex()) + } + } + + // Stop our decoder, and wait for it to finish + close(stopCh) + select { + case <-finishCh: + case <-r.shutdownCh: + } + return nil +} + +// pipelineSend is used to send data over a pipeline. +func (r *Raft) pipelineSend(s *followerReplication, p AppendPipeline, nextIdx *uint64, lastIndex uint64) (shouldStop bool) { + // Create a new append request + req := new(AppendEntriesRequest) + if err := r.setupAppendEntries(s, req, *nextIdx, lastIndex); err != nil { + return true + } + + // Pipeline the append entries + if _, err := p.AppendEntries(req, new(AppendEntriesResponse)); err != nil { + r.logger.Printf("[ERR] raft: Failed to pipeline AppendEntries to %v: %v", s.peer, err) + return true + } + + // Increase the next send log to avoid re-sending old logs + if n := len(req.Entries); n > 0 { + last := req.Entries[n-1] + *nextIdx = last.Index + 1 + } + return false +} + +// pipelineDecode is used to decode the responses of pipelined requests. +func (r *Raft) pipelineDecode(s *followerReplication, p AppendPipeline, stopCh, finishCh chan struct{}) { + defer close(finishCh) + respCh := p.Consumer() + for { + select { + case ready := <-respCh: + req, resp := ready.Request(), ready.Response() + appendStats(s.peer, ready.Start(), float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return + } + + // Update the last contact + s.setLastContact() + + // Abort pipeline if not successful + if !resp.Success { + return + } + + // Update our replication state + updateLastAppended(s, req) + case <-stopCh: + return + } + } +} + +// setupAppendEntries is used to setup an append entries request. +func (r *Raft) setupAppendEntries(s *followerReplication, req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + req.Term = s.currentTerm + req.Leader = r.trans.EncodePeer(r.localAddr) + req.LeaderCommitIndex = r.getCommitIndex() + if err := r.setPreviousLog(req, nextIndex); err != nil { + return err + } + if err := r.setNewLogs(req, nextIndex, lastIndex); err != nil { + return err + } + return nil +} + +// setPreviousLog is used to setup the PrevLogEntry and PrevLogTerm for an +// AppendEntriesRequest given the next index to replicate. +func (r *Raft) setPreviousLog(req *AppendEntriesRequest, nextIndex uint64) error { + // Guard for the first index, since there is no 0 log entry + // Guard against the previous index being a snapshot as well + if nextIndex == 1 { + req.PrevLogEntry = 0 + req.PrevLogTerm = 0 + + } else if (nextIndex - 1) == r.getLastSnapshotIndex() { + req.PrevLogEntry = r.getLastSnapshotIndex() + req.PrevLogTerm = r.getLastSnapshotTerm() + + } else { + var l Log + if err := r.logs.GetLog(nextIndex-1, &l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", + nextIndex-1, err) + return err + } + + // Set the previous index and term (0 if nextIndex is 1) + req.PrevLogEntry = l.Index + req.PrevLogTerm = l.Term + } + return nil +} + +// setNewLogs is used to setup the logs which should be appended for a request. +func (r *Raft) setNewLogs(req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + // Append up to MaxAppendEntries or up to the lastIndex + req.Entries = make([]*Log, 0, r.conf.MaxAppendEntries) + maxIndex := min(nextIndex+uint64(r.conf.MaxAppendEntries)-1, lastIndex) + for i := nextIndex; i <= maxIndex; i++ { + oldLog := new(Log) + if err := r.logs.GetLog(i, oldLog); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", i, err) + return err + } + req.Entries = append(req.Entries, oldLog) + } + return nil +} + +// appendStats is used to emit stats about an AppendEntries invocation. +func appendStats(peer string, start time.Time, logs float32) { + metrics.MeasureSince([]string{"raft", "replication", "appendEntries", "rpc", peer}, start) + metrics.IncrCounter([]string{"raft", "replication", "appendEntries", "logs", peer}, logs) +} + +// handleStaleTerm is used when a follower indicates that we have a stale term. +func (r *Raft) handleStaleTerm(s *followerReplication) { + r.logger.Printf("[ERR] raft: peer %v has newer term, stopping replication", s.peer) + s.notifyAll(false) // No longer leader + asyncNotifyCh(s.stepDown) +} + +// updateLastAppended is used to update follower replication state after a successful +// AppendEntries RPC. +func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) { + // Mark any inflight logs as committed + if logs := req.Entries; len(logs) > 0 { + first := logs[0] + last := logs[len(logs)-1] + s.inflight.CommitRange(first.Index, last.Index) + + // Update the indexes + s.matchIndex = last.Index + s.nextIndex = last.Index + 1 + } + + // Notify still leader + s.notifyAll(true) +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/snapshot.go b/Godeps/_workspace/src/github.com/hashicorp/raft/snapshot.go new file mode 100644 index 00000000000..7151f43ce26 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/snapshot.go @@ -0,0 +1,40 @@ +package raft + +import ( + "io" +) + +// SnapshotMeta is for metadata of a snapshot. +type SnapshotMeta struct { + ID string // ID is opaque to the store, and is used for opening + Index uint64 + Term uint64 + Peers []byte + Size int64 +} + +// SnapshotStore interface is used to allow for flexible implementations +// of snapshot storage and retrieval. For example, a client could implement +// a shared state store such as S3, allowing new nodes to restore snapshots +// without steaming from the leader. +type SnapshotStore interface { + // Create is used to begin a snapshot at a given index and term, + // with the current peer set already encoded. + Create(index, term uint64, peers []byte) (SnapshotSink, error) + + // List is used to list the available snapshots in the store. + // It should return then in descending order, with the highest index first. + List() ([]*SnapshotMeta, error) + + // Open takes a snapshot ID and provides a ReadCloser. Once close is + // called it is assumed the snapshot is no longer needed. + Open(id string) (*SnapshotMeta, io.ReadCloser, error) +} + +// SnapshotSink is returned by StartSnapshot. The FSM will Write state +// to the sink and call Close on completion. On error, Cancel will be invoked. +type SnapshotSink interface { + io.WriteCloser + ID() string + Cancel() error +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/stable.go b/Godeps/_workspace/src/github.com/hashicorp/raft/stable.go new file mode 100644 index 00000000000..ff59a8c570a --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/stable.go @@ -0,0 +1,15 @@ +package raft + +// StableStore is used to provide stable storage +// of key configurations to ensure safety. +type StableStore interface { + Set(key []byte, val []byte) error + + // Get returns the value for key, or an empty byte slice if key was not found. + Get(key []byte) ([]byte, error) + + SetUint64(key []byte, val uint64) error + + // GetUint64 returns the uint64 value for key, or 0 if key was not found. + GetUint64(key []byte) (uint64, error) +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/state.go b/Godeps/_workspace/src/github.com/hashicorp/raft/state.go new file mode 100644 index 00000000000..41e80a1b510 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/state.go @@ -0,0 +1,169 @@ +package raft + +import ( + "sync/atomic" +) + +// RaftState captures the state of a Raft node: Follower, Candidate, Leader, +// or Shutdown. +type RaftState uint32 + +const ( + // Follower is the initial state of a Raft node. + Follower RaftState = iota + + // Candidate is one of the valid states of a Raft node. + Candidate + + // Leader is one of the valid states of a Raft node. + Leader + + // Shutdown is the terminal state of a Raft node. + Shutdown +) + +func (s RaftState) String() string { + switch s { + case Follower: + return "Follower" + case Candidate: + return "Candidate" + case Leader: + return "Leader" + case Shutdown: + return "Shutdown" + default: + return "Unknown" + } +} + +// raftState is used to maintain various state variables +// and provides an interface to set/get the variables in a +// thread safe manner. +type raftState struct { + // The current term, cache of StableStore + currentTerm uint64 + + // Cache the latest log from LogStore + LastLogIndex uint64 + LastLogTerm uint64 + + // Highest committed log entry + commitIndex uint64 + + // Last applied log to the FSM + lastApplied uint64 + + // Cache the latest snapshot index/term + lastSnapshotIndex uint64 + lastSnapshotTerm uint64 + + // Tracks the number of live routines + runningRoutines int32 + + // The current state + state RaftState +} + +func (r *raftState) getState() RaftState { + stateAddr := (*uint32)(&r.state) + return RaftState(atomic.LoadUint32(stateAddr)) +} + +func (r *raftState) setState(s RaftState) { + stateAddr := (*uint32)(&r.state) + atomic.StoreUint32(stateAddr, uint32(s)) +} + +func (r *raftState) getCurrentTerm() uint64 { + return atomic.LoadUint64(&r.currentTerm) +} + +func (r *raftState) setCurrentTerm(term uint64) { + atomic.StoreUint64(&r.currentTerm, term) +} + +func (r *raftState) getLastLogIndex() uint64 { + return atomic.LoadUint64(&r.LastLogIndex) +} + +func (r *raftState) setLastLogIndex(term uint64) { + atomic.StoreUint64(&r.LastLogIndex, term) +} + +func (r *raftState) getLastLogTerm() uint64 { + return atomic.LoadUint64(&r.LastLogTerm) +} + +func (r *raftState) setLastLogTerm(term uint64) { + atomic.StoreUint64(&r.LastLogTerm, term) +} + +func (r *raftState) getCommitIndex() uint64 { + return atomic.LoadUint64(&r.commitIndex) +} + +func (r *raftState) setCommitIndex(term uint64) { + atomic.StoreUint64(&r.commitIndex, term) +} + +func (r *raftState) getLastApplied() uint64 { + return atomic.LoadUint64(&r.lastApplied) +} + +func (r *raftState) setLastApplied(term uint64) { + atomic.StoreUint64(&r.lastApplied, term) +} + +func (r *raftState) getLastSnapshotIndex() uint64 { + return atomic.LoadUint64(&r.lastSnapshotIndex) +} + +func (r *raftState) setLastSnapshotIndex(term uint64) { + atomic.StoreUint64(&r.lastSnapshotIndex, term) +} + +func (r *raftState) getLastSnapshotTerm() uint64 { + return atomic.LoadUint64(&r.lastSnapshotTerm) +} + +func (r *raftState) setLastSnapshotTerm(term uint64) { + atomic.StoreUint64(&r.lastSnapshotTerm, term) +} + +func (r *raftState) incrRoutines() { + atomic.AddInt32(&r.runningRoutines, 1) +} + +func (r *raftState) decrRoutines() { + atomic.AddInt32(&r.runningRoutines, -1) +} + +func (r *raftState) getRoutines() int32 { + return atomic.LoadInt32(&r.runningRoutines) +} + +// Start a goroutine and properly handle the race between a routine +// starting and incrementing, and exiting and decrementing. +func (r *raftState) goFunc(f func()) { + r.incrRoutines() + go func() { + defer r.decrRoutines() + f() + }() +} + +// getLastIndex returns the last index in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastIndex() uint64 { + return max(r.getLastLogIndex(), r.getLastSnapshotIndex()) +} + +// getLastEntry returns the last index and term in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastEntry() (uint64, uint64) { + if r.getLastLogIndex() >= r.getLastSnapshotIndex() { + return r.getLastLogIndex(), r.getLastLogTerm() + } + return r.getLastSnapshotIndex(), r.getLastSnapshotTerm() +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/tcp_transport.go b/Godeps/_workspace/src/github.com/hashicorp/raft/tcp_transport.go new file mode 100644 index 00000000000..50c6d15df18 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/tcp_transport.go @@ -0,0 +1,105 @@ +package raft + +import ( + "errors" + "io" + "log" + "net" + "time" +) + +var ( + errNotAdvertisable = errors.New("local bind address is not advertisable") + errNotTCP = errors.New("local address is not a TCP address") +) + +// TCPStreamLayer implements StreamLayer interface for plain TCP. +type TCPStreamLayer struct { + advertise net.Addr + listener *net.TCPListener +} + +// NewTCPTransport returns a NetworkTransport that is built on top of +// a TCP streaming transport layer. +func NewTCPTransport( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransport(stream, maxPool, timeout, logOutput) + }) +} + +// NewTCPTransportWithLogger returns a NetworkTransport that is built on top of +// a TCP streaming transport layer, with log output going to the supplied Logger +func NewTCPTransportWithLogger( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransportWithLogger(stream, maxPool, timeout, logger) + }) +} + +func newTCPTransport(bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + transportCreator func(stream StreamLayer) *NetworkTransport) (*NetworkTransport, error) { + // Try to bind + list, err := net.Listen("tcp", bindAddr) + if err != nil { + return nil, err + } + + // Create stream + stream := &TCPStreamLayer{ + advertise: advertise, + listener: list.(*net.TCPListener), + } + + // Verify that we have a usable advertise address + addr, ok := stream.Addr().(*net.TCPAddr) + if !ok { + list.Close() + return nil, errNotTCP + } + if addr.IP.IsUnspecified() { + list.Close() + return nil, errNotAdvertisable + } + + // Create the network transport + trans := transportCreator(stream) + return trans, nil +} + +// Dial implements the StreamLayer interface. +func (t *TCPStreamLayer) Dial(address string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("tcp", address, timeout) +} + +// Accept implements the net.Listener interface. +func (t *TCPStreamLayer) Accept() (c net.Conn, err error) { + return t.listener.Accept() +} + +// Close implements the net.Listener interface. +func (t *TCPStreamLayer) Close() (err error) { + return t.listener.Close() +} + +// Addr implements the net.Listener interface. +func (t *TCPStreamLayer) Addr() net.Addr { + // Use an advertise addr if provided + if t.advertise != nil { + return t.advertise + } + return t.listener.Addr() +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/transport.go b/Godeps/_workspace/src/github.com/hashicorp/raft/transport.go new file mode 100644 index 00000000000..8928de0c2fc --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/transport.go @@ -0,0 +1,85 @@ +package raft + +import ( + "io" + "time" +) + +// RPCResponse captures both a response and a potential error. +type RPCResponse struct { + Response interface{} + Error error +} + +// RPC has a command, and provides a response mechanism. +type RPC struct { + Command interface{} + Reader io.Reader // Set only for InstallSnapshot + RespChan chan<- RPCResponse +} + +// Respond is used to respond with a response, error or both +func (r *RPC) Respond(resp interface{}, err error) { + r.RespChan <- RPCResponse{resp, err} +} + +// Transport provides an interface for network transports +// to allow Raft to communicate with other nodes. +type Transport interface { + // Consumer returns a channel that can be used to + // consume and respond to RPC requests. + Consumer() <-chan RPC + + // LocalAddr is used to return our local address to distinguish from our peers. + LocalAddr() string + + // AppendEntriesPipeline returns an interface that can be used to pipeline + // AppendEntries requests. + AppendEntriesPipeline(target string) (AppendPipeline, error) + + // AppendEntries sends the appropriate RPC to the target node. + AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error + + // RequestVote sends the appropriate RPC to the target node. + RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error + + // InstallSnapshot is used to push a snapshot down to a follower. The data is read from + // the ReadCloser and streamed to the client. + InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error + + // EncodePeer is used to serialize a peer name. + EncodePeer(string) []byte + + // DecodePeer is used to deserialize a peer name. + DecodePeer([]byte) string + + // SetHeartbeatHandler is used to setup a heartbeat handler + // as a fast-pass. This is to avoid head-of-line blocking from + // disk IO. If a Transport does not support this, it can simply + // ignore the call, and push the heartbeat onto the Consumer channel. + SetHeartbeatHandler(cb func(rpc RPC)) +} + +// AppendPipeline is used for pipelining AppendEntries requests. It is used +// to increase the replication throughput by masking latency and better +// utilizing bandwidth. +type AppendPipeline interface { + // AppendEntries is used to add another request to the pipeline. + // The send may block which is an effective form of back-pressure. + AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) + + // Consumer returns a channel that can be used to consume + // response futures when they are ready. + Consumer() <-chan AppendFuture + + // Closes pipeline and cancels all inflight RPCs + Close() error +} + +// AppendFuture is used to return information about a pipelined AppendEntries request. +type AppendFuture interface { + Future + Start() time.Time + Request() *AppendEntriesRequest + Response() *AppendEntriesResponse +} diff --git a/Godeps/_workspace/src/github.com/hashicorp/raft/util.go b/Godeps/_workspace/src/github.com/hashicorp/raft/util.go new file mode 100644 index 00000000000..a6642c4c9e6 --- /dev/null +++ b/Godeps/_workspace/src/github.com/hashicorp/raft/util.go @@ -0,0 +1,200 @@ +package raft + +import ( + "bytes" + crand "crypto/rand" + "encoding/binary" + "fmt" + "math" + "math/big" + "math/rand" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +func init() { + // Ensure we use a high-entropy seed for the psuedo-random generator + rand.Seed(newSeed()) +} + +// returns an int64 from a crypto random source +// can be used to seed a source for a math/rand. +func newSeed() int64 { + r, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + return r.Int64() +} + +// randomTimeout returns a value that is between the minVal and 2x minVal. +func randomTimeout(minVal time.Duration) <-chan time.Time { + if minVal == 0 { + return nil + } + extra := (time.Duration(rand.Int63()) % minVal) + return time.After(minVal + extra) +} + +// min returns the minimum. +func min(a, b uint64) uint64 { + if a <= b { + return a + } + return b +} + +// max returns the maximum. +func max(a, b uint64) uint64 { + if a >= b { + return a + } + return b +} + +// generateUUID is used to generate a random UUID. +func generateUUID() string { + buf := make([]byte, 16) + if _, err := crand.Read(buf); err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + + return fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", + buf[0:4], + buf[4:6], + buf[6:8], + buf[8:10], + buf[10:16]) +} + +// asyncNotify is used to do an async channel send to +// a list of channels. This will not block. +func asyncNotify(chans []chan struct{}) { + for _, ch := range chans { + asyncNotifyCh(ch) + } +} + +// asyncNotifyCh is used to do an async channel send +// to a single channel without blocking. +func asyncNotifyCh(ch chan struct{}) { + select { + case ch <- struct{}{}: + default: + } +} + +// asyncNotifyBool is used to do an async notification +// on a bool channel. +func asyncNotifyBool(ch chan bool, v bool) { + select { + case ch <- v: + default: + } +} + +// ExcludePeer is used to exclude a single peer from a list of peers. +func ExcludePeer(peers []string, peer string) []string { + otherPeers := make([]string, 0, len(peers)) + for _, p := range peers { + if p != peer { + otherPeers = append(otherPeers, p) + } + } + return otherPeers +} + +// PeerContained checks if a given peer is contained in a list. +func PeerContained(peers []string, peer string) bool { + for _, p := range peers { + if p == peer { + return true + } + } + return false +} + +// AddUniquePeer is used to add a peer to a list of existing +// peers only if it is not already contained. +func AddUniquePeer(peers []string, peer string) []string { + if PeerContained(peers, peer) { + return peers + } + return append(peers, peer) +} + +// encodePeers is used to serialize a list of peers. +func encodePeers(peers []string, trans Transport) []byte { + // Encode each peer + var encPeers [][]byte + for _, p := range peers { + encPeers = append(encPeers, trans.EncodePeer(p)) + } + + // Encode the entire array + buf, err := encodeMsgPack(encPeers) + if err != nil { + panic(fmt.Errorf("failed to encode peers: %v", err)) + } + + return buf.Bytes() +} + +// decodePeers is used to deserialize a list of peers. +func decodePeers(buf []byte, trans Transport) []string { + // Decode the buffer first + var encPeers [][]byte + if err := decodeMsgPack(buf, &encPeers); err != nil { + panic(fmt.Errorf("failed to decode peers: %v", err)) + } + + // Deserialize each peer + var peers []string + for _, enc := range encPeers { + peers = append(peers, trans.DecodePeer(enc)) + } + + return peers +} + +// Decode reverses the encode operation on a byte slice input. +func decodeMsgPack(buf []byte, out interface{}) error { + r := bytes.NewBuffer(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// Encode writes an encoded object to a new bytes buffer. +func encodeMsgPack(in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// Converts bytes to an integer. +func bytesToUint64(b []byte) uint64 { + return binary.BigEndian.Uint64(b) +} + +// Converts a uint64 to a byte slice. +func uint64ToBytes(u uint64) []byte { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, u) + return buf +} + +// backoff is used to compute an exponential backoff +// duration. Base time is scaled by the current round, +// up to some maximum scale factor. +func backoff(base time.Duration, round, limit uint64) time.Duration { + power := min(round, limit) + for power > 2 { + base *= 2 + power-- + } + return base +} diff --git a/Godeps/_workspace/src/github.com/influxdb/influxdb/LICENSE b/Godeps/_workspace/src/github.com/influxdb/influxdb/LICENSE index 03f21e89fec..d50222706cf 100644 --- a/Godeps/_workspace/src/github.com/influxdb/influxdb/LICENSE +++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2013-2014 Errplane Inc. +Copyright (c) 2013-2015 Errplane Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/Godeps/_workspace/src/github.com/influxdb/influxdb/_vendor/raft/LICENSE b/Godeps/_workspace/src/github.com/influxdb/influxdb/_vendor/raft/LICENSE deleted file mode 100644 index ee7f222286b..00000000000 --- a/Godeps/_workspace/src/github.com/influxdb/influxdb/_vendor/raft/LICENSE +++ /dev/null @@ -1,20 +0,0 @@ -Copyright 2013 go-raft contributors - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/README.md b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/README.md index 2d849dfb7e0..012109bc00c 100644 --- a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/README.md +++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/README.md @@ -1,2 +1,206 @@ -influxdb-go -=========== +# InfluxDB Client + +[![GoDoc](https://godoc.org/github.com/influxdb/influxdb?status.svg)](http://godoc.org/github.com/influxdb/influxdb/client) + +## Description + +A Go client library written and maintained by the **InfluxDB** team. +This package provides convenience functions to read and write time series data. +It uses the HTTP protocol to communicate with your **InfluxDB** cluster. + + +## Getting Started + +### Connecting To Your Database + +Connecting to an **InfluxDB** database is straightforward. You will need a host +name, a port and the cluster user credentials if applicable. The default port is 8086. +You can customize these settings to your specific installation via the +**InfluxDB** configuration file. + +Thought not necessary for experimentation, you may want to create a new user +and authenticate the connection to your database. + +For more information please check out the +[Cluster Admin Docs](http://influxdb.com/docs/v0.9/query_language/database_administration.html). + +For the impatient, you can create a new admin user _bubba_ by firing off the +[InfluxDB CLI](https://github.com/influxdb/influxdb/blob/master/cmd/influx/main.go). + +```shell +influx +> create user bubba with password 'bumblebeetuna' +> grant all privileges to bubba +``` + +And now for good measure set the credentials in you shell environment. +In the example below we will use $INFLUX_USER and $INFLUX_PWD + +Now with the administrivia out of the way, let's connect to our database. + +NOTE: If you've opted out of creating a user, you can omit Username and Password in +the configuration below. + +```go +package main + +import "github.com/influxdb/influxdb/client" + +const ( + MyHost = "localhost" + MyPort = 8086 + MyDB = "square_holes" + MyMeasurement = "shapes" +) + +func main() { + u, err := url.Parse(fmt.Sprintf("http://%s:%d", MyHost, MyPort)) + if err != nil { + log.Fatal(err) + } + + conf := client.Config{ + URL: *u, + Username: os.Getenv("INFLUX_USER"), + Password: os.Getenv("INFLUX_PWD"), + } + + con, err := client.NewClient(conf) + if err != nil { + log.Fatal(err) + } + + dur, ver, err := con.Ping() + if err != nil { + log.Fatal(err) + } + log.Printf("Happy as a Hippo! %v, %s", dur, ver) +} + +``` + +### Inserting Data + +Time series data aka *points* are written to the database using batch inserts. +The mechanism is to create one or more points and then create a batch aka *batch points* +and write these to a given database and series. A series is a combination of a +measurement (time/values) and a set of tags. + +In this sample we will create a batch of a 1,000 points. Each point has a time and +a single value as well as 2 tags indicating a shape and color. We write these points +to a database called _square_holes_ using a measurement named _shapes_. + +NOTE: You can specify a RetentionPolicy as part of the batch points. If not +provided InfluxDB will use the database _default_ retention policy. By default, the _default_ +retention policy never deletes any data it contains. + +```go +func writePoints(con *client.Client) { + var ( + shapes = []string{"circle", "rectangle", "square", "triangle"} + colors = []string{"red", "blue", "green"} + sampleSize = 1000 + pts = make([]client.Point, sampleSize) + ) + + rand.Seed(42) + for i := 0; i < sampleSize; i++ { + pts[i] = client.Point{ + Measurement: "shapes", + Tags: map[string]string{ + "color": strconv.Itoa(rand.Intn(len(colors))), + "shape": strconv.Itoa(rand.Intn(len(shapes))), + }, + Fields: map[string]interface{}{ + "value": rand.Intn(sampleSize), + }, + Time: time.Now(), + Precision: "s", + } + } + + bps := client.BatchPoints{ + Points: pts, + Database: MyDB, + RetentionPolicy: "default", + } + _, err := con.Write(bps) + if err != nil { + log.Fatal(err) + } +} +``` + + +### Querying Data + +One nice advantage of using **InfluxDB** the ability to query your data using familiar +SQL constructs. In this example we can create a convenience function to query the database +as follows: + +```go +// queryDB convenience function to query the database +func queryDB(con *client.Client, cmd string) (res []client.Result, err error) { + q := client.Query{ + Command: cmd, + Database: MyDB, + } + if response, err := con.Query(q); err == nil { + if response.Error() != nil { + return res, response.Error() + } + res = response.Results + } + return +} +``` + +#### Creating a Database +```go +_, err := queryDB(con, fmt.Sprintf("create database %s", MyDB)) +if err != nil { + log.Fatal(err) +} +``` + +#### Count Records +```go +q := fmt.Sprintf("select count(%s) from %s", "value", MyMeasurement) +res, err := queryDB(con, q) +if err != nil { + log.Fatal(err) +} +count := res[0].Series[0].Values[0][1] +log.Printf("Found a total of `%v records", count) + +``` + +#### Find the last 10 _shapes_ records + +```go +q := fmt.Sprintf("select * from %s limit %d", MyMeasurement, 20) +res, err = queryDB(con, q) +if err != nil { + log.Fatal(err) +} + +for i, row := range res[0].Series[0].Values { + t, err := time.Parse(time.RFC3339, row[0].(string)) + if err != nil { + log.Fatal(err) + } + val, err := row[1].(json.Number).Int64() + log.Printf("[%2d] %s: %03d\n", i, t.Format(time.Stamp), val) +} +``` + +## Go Docs + +Please refer to +[http://godoc.org/github.com/influxdb/influxdb/client](http://godoc.org/github.com/influxdb/influxdb/client) +for documentation. + +## See Also + +You can also examine how the client library is used by the +[InfluxDB CLI](https://github.com/influxdb/influxdb/blob/master/cmd/influx/main.go). diff --git a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/examples/example.go b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/examples/example.go deleted file mode 100644 index 6cc866e88c0..00000000000 --- a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/examples/example.go +++ /dev/null @@ -1,200 +0,0 @@ -package examples - -import ( - "fmt" - - "github.com/influxdb/influxdb/client" -) - -func main() { - TestClient() -} - -func TestClient() { - internalTest(true) -} - -func TestClientWithoutCompression() { - internalTest(false) -} - -func internalTest(compression bool) { - c, err := client.NewClient(&client.ClientConfig{}) - if err != nil { - panic(err) - } - - admins, err := c.GetClusterAdminList() - if err != nil { - panic(err) - } - - if len(admins) == 1 { - if err := c.CreateClusterAdmin("admin", "password"); err != nil { - panic(err) - } - } - - admins, err = c.GetClusterAdminList() - if err != nil { - panic(err) - } - - if len(admins) != 2 { - panic("more than two admins returned") - } - - dbs, err := c.GetDatabaseList() - if err != nil { - panic(err) - } - - if len(dbs) == 0 { - if err := c.CreateDatabase("foobar"); err != nil { - panic(err) - } - } - - dbs, err = c.GetDatabaseList() - if err != nil { - panic(err) - } - - if len(dbs) != 1 && dbs[0]["foobar"] == nil { - panic("List of databases don't match") - } - - users, err := c.GetDatabaseUserList("foobar") - if err != nil { - panic(err) - } - - if len(users) == 0 { - if err := c.CreateDatabaseUser("foobar", "dbuser", "pass"); err != nil { - panic(err) - } - - if err := c.AlterDatabasePrivilege("foobar", "dbuser", true); err != nil { - panic(err) - } - } - - users, err = c.GetDatabaseUserList("foobar") - if err != nil { - panic(err) - } - - if len(users) != 1 { - panic("more than one user returned") - } - - c, err = client.NewClient(&client.ClientConfig{ - Username: "dbuser", - Password: "pass", - Database: "foobar", - }) - - if !compression { - c.DisableCompression() - } - - if err != nil { - panic(err) - } - - name := "ts9" - if !compression { - name = "ts9_uncompressed" - } - - series := &client.Series{ - Name: name, - Columns: []string{"value"}, - Points: [][]interface{}{ - {1.0}, - }, - } - if err := c.WriteSeries([]*client.Series{series}); err != nil { - panic(err) - } - - result, err := c.Query("select * from " + name) - if err != nil { - panic(err) - } - - if len(result) != 1 { - panic(fmt.Errorf("expected one time series returned: %d", len(result))) - } - - if len(result[0].Points) != 1 { - panic(fmt.Errorf("Expected one point: %d", len(result[0].Points))) - } - - if result[0].Points[0][2].(float64) != 1 { - panic("Value not equal to 1") - } - - c, err = client.NewClient(&client.ClientConfig{ - Username: "root", - Password: "root", - }) - - if err != nil { - panic(err) - } - - spaces, err := c.GetShardSpaces() - if err != nil || len(spaces) == 0 { - panic(fmt.Errorf("Got empty spaces back: %s", err)) - } - if spaces[0].Name != "default" { - panic("Space name isn't default") - } - space := &client.ShardSpace{Name: "foo", Regex: "/^paul_is_rad/"} - err = c.CreateShardSpace("foobar", space) - if err != nil { - panic(err) - } - spaces, _ = c.GetShardSpaces() - if spaces[1].Name != "foo" { - panic("Space name isn't foo") - } - shards, err := c.GetShards() - if err != nil { - panic(fmt.Errorf("Couldn't get shards back: %s", err)) - } - - c, err = client.NewClient(&client.ClientConfig{ - Username: "root", - Password: "root", - Database: "", - }) - series = &client.Series{ - Name: "paul_is_rad", - Columns: []string{"value"}, - Points: [][]interface{}{ - {1.0}, - }, - } - if err := c.WriteSeries([]*client.Series{series}); err != nil { - panic(err) - } - - spaces, _ = c.GetShardSpaces() - count := 0 - for _, s := range shards.All { - if s.SpaceName == "foo" { - count++ - } - } - - if err := c.DropShardSpace("foobar", "foo"); err != nil { - panic(fmt.Errorf("Error: %s", err)) - } - - spaces, err = c.GetShardSpaces() - if err != nil || len(spaces) != 1 || spaces[0].Name != "default" { - panic(fmt.Errorf("Error: %s, %d, %s", err, len(spaces), spaces[0].Name)) - } -} diff --git a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/influxdb.go b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/influxdb.go index 22a50e5bcd2..2ec08a96eec 100644 --- a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/influxdb.go +++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/influxdb.go @@ -2,609 +2,582 @@ package client import ( "bytes" - "compress/gzip" "encoding/json" + "errors" "fmt" "io" "io/ioutil" - "net" "net/http" "net/url" - "strings" + "time" + + "github.com/influxdb/influxdb/influxql" + "github.com/influxdb/influxdb/tsdb" ) -const ( - UDPMaxMessageSize = 2048 -) +// Query is used to send a command to the server. Both Command and Database are required. +type Query struct { + Command string + Database string +} +// Config is used to specify what server to connect to. +// URL: The URL of the server connecting to. +// Username/Password are optional. They will be passed via basic auth if provided. +// UserAgent: If not provided, will default "InfluxDBClient", +// Timeout: If not provided, will default to 0 (no timeout) +type Config struct { + URL url.URL + Username string + Password string + UserAgent string + Timeout time.Duration +} + +// Client is used to make calls to the server. type Client struct { - host string - username string - password string - database string - httpClient *http.Client - udpConn *net.UDPConn - schema string - compression bool + url url.URL + username string + password string + httpClient *http.Client + userAgent string } -type ClientConfig struct { - Host string - Username string - Password string - Database string - HttpClient *http.Client - IsSecure bool - IsUDP bool -} - -var defaults *ClientConfig - -func init() { - defaults = &ClientConfig{ - Host: "localhost:8086", - Username: "root", - Password: "root", - Database: "", - HttpClient: http.DefaultClient, - } -} - -func getDefault(value, defaultValue string) string { - if value == "" { - return defaultValue - } - return value -} - -func New(config *ClientConfig) (*Client, error) { - return NewClient(config) -} - -func NewClient(config *ClientConfig) (*Client, error) { - host := getDefault(config.Host, defaults.Host) - username := getDefault(config.Username, defaults.Username) - password := getDefault(config.Password, defaults.Password) - database := getDefault(config.Database, defaults.Database) - if config.HttpClient == nil { - config.HttpClient = defaults.HttpClient - } - var udpConn *net.UDPConn - if config.IsUDP { - serverAddr, err := net.ResolveUDPAddr("udp", host) - if err != nil { - return nil, err - } - udpConn, err = net.DialUDP("udp", nil, serverAddr) - if err != nil { - return nil, err - } - } - - schema := "http" - if config.IsSecure { - schema = "https" - } - return &Client{host, username, password, database, config.HttpClient, udpConn, schema, false}, nil -} - -func (self *Client) DisableCompression() { - self.compression = false -} - -func (self *Client) getUrl(path string) string { - return self.getUrlWithUserAndPass(path, self.username, self.password) -} - -func (self *Client) getUrlWithUserAndPass(path, username, password string) string { - return fmt.Sprintf("%s://%s%s?u=%s&p=%s", self.schema, self.host, path, username, password) -} - -func responseToError(response *http.Response, err error, closeResponse bool) error { - if err != nil { - return err - } - if closeResponse { - defer response.Body.Close() - } - if response.StatusCode >= 200 && response.StatusCode < 300 { - return nil - } - defer response.Body.Close() - body, err := ioutil.ReadAll(response.Body) - if err != nil { - return err - } - return fmt.Errorf("Server returned (%d): %s", response.StatusCode, string(body)) -} - -func (self *Client) CreateDatabase(name string) error { - url := self.getUrl("/db") - payload := map[string]string{"name": name} - data, err := json.Marshal(payload) - if err != nil { - return err - } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) -} - -func (self *Client) del(url string) (*http.Response, error) { - return self.delWithBody(url, nil) -} - -func (self *Client) delWithBody(url string, body io.Reader) (*http.Response, error) { - req, err := http.NewRequest("DELETE", url, body) - if err != nil { - return nil, err - } - return self.httpClient.Do(req) -} - -func (self *Client) DeleteDatabase(name string) error { - url := self.getUrl("/db/" + name) - resp, err := self.del(url) - return responseToError(resp, err, true) -} - -func (self *Client) get(url string) ([]byte, error) { - resp, err := self.httpClient.Get(url) - err = responseToError(resp, err, false) - if err != nil { - return nil, err - } - defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) - return body, err -} - -func (self *Client) getWithVersion(url string) ([]byte, string, error) { - resp, err := self.httpClient.Get(url) - err = responseToError(resp, err, false) - if err != nil { - return nil, "", err - } - defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) - version := resp.Header.Get("X-Influxdb-Version") - fields := strings.Fields(version) - if len(fields) > 2 { - return body, fields[1], err - } - return body, "", err -} - -func (self *Client) listSomething(url string) ([]map[string]interface{}, error) { - body, err := self.get(url) - if err != nil { - return nil, err - } - somethings := []map[string]interface{}{} - err = json.Unmarshal(body, &somethings) - if err != nil { - return nil, err - } - return somethings, nil -} - -func (self *Client) GetDatabaseList() ([]map[string]interface{}, error) { - url := self.getUrl("/db") - return self.listSomething(url) -} - -func (self *Client) CreateClusterAdmin(name, password string) error { - url := self.getUrl("/cluster_admins") - payload := map[string]string{"name": name, "password": password} - data, err := json.Marshal(payload) - if err != nil { - return err - } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) -} - -func (self *Client) UpdateClusterAdmin(name, password string) error { - url := self.getUrl("/cluster_admins/" + name) - payload := map[string]string{"password": password} - data, err := json.Marshal(payload) - if err != nil { - return err - } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) -} - -func (self *Client) DeleteClusterAdmin(name string) error { - url := self.getUrl("/cluster_admins/" + name) - resp, err := self.del(url) - return responseToError(resp, err, true) -} - -func (self *Client) GetClusterAdminList() ([]map[string]interface{}, error) { - url := self.getUrl("/cluster_admins") - return self.listSomething(url) -} - -func (self *Client) Servers() ([]map[string]interface{}, error) { - url := self.getUrl("/cluster/servers") - return self.listSomething(url) -} - -func (self *Client) RemoveServer(id int) error { - resp, err := self.del(self.getUrl(fmt.Sprintf("/cluster/servers/%d", id))) - return responseToError(resp, err, true) -} - -// Creates a new database user for the given database. permissions can -// be omitted in which case the user will be able to read and write to -// all time series. If provided, there should be two strings, the -// first for read and the second for write. The strings are regexes -// that are used to match the time series name to determine whether -// the user has the ability to read/write to the given time series. -// -// client.CreateDatabaseUser("db", "user", "pass") -// // the following user cannot read from any series and can write -// // to the limited time series only -// client.CreateDatabaseUser("db", "limited", "pass", "^$", "limited") -func (self *Client) CreateDatabaseUser(database, name, password string, permissions ...string) error { - readMatcher, writeMatcher := ".*", ".*" - switch len(permissions) { - case 0: - case 2: - readMatcher, writeMatcher = permissions[0], permissions[1] - default: - return fmt.Errorf("You have to provide two ") - } - - url := self.getUrl("/db/" + database + "/users") - payload := map[string]string{"name": name, "password": password, "readFrom": readMatcher, "writeTo": writeMatcher} - data, err := json.Marshal(payload) - if err != nil { - return err - } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) -} - -// Change the cluster admin password -func (self *Client) ChangeClusterAdminPassword(name, newPassword string) error { - url := self.getUrl("/cluster_admins/" + name) - payload := map[string]interface{}{"password": newPassword} - data, err := json.Marshal(payload) - if err != nil { - return err - } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) -} - -// Change the user password, adming flag and optionally permissions -func (self *Client) ChangeDatabaseUser(database, name, newPassword string, isAdmin bool, newPermissions ...string) error { - switch len(newPermissions) { - case 0, 2: - default: - return fmt.Errorf("You have to provide two ") - } - - url := self.getUrl("/db/" + database + "/users/" + name) - payload := map[string]interface{}{"password": newPassword, "admin": isAdmin} - if len(newPermissions) == 2 { - payload["readFrom"] = newPermissions[0] - payload["writeTo"] = newPermissions[1] - } - data, err := json.Marshal(payload) - if err != nil { - return err - } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) -} - -// See Client.CreateDatabaseUser for more info on the permissions -// argument -func (self *Client) updateDatabaseUserCommon(database, name string, password *string, isAdmin *bool, permissions ...string) error { - url := self.getUrl("/db/" + database + "/users/" + name) - payload := map[string]interface{}{} - if password != nil { - payload["password"] = *password - } - if isAdmin != nil { - payload["admin"] = *isAdmin - } - switch len(permissions) { - case 0: - case 2: - payload["readFrom"] = permissions[0] - payload["writeTo"] = permissions[1] - default: - } - - data, err := json.Marshal(payload) - if err != nil { - return err - } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) -} - -func (self *Client) UpdateDatabaseUser(database, name, password string) error { - return self.updateDatabaseUserCommon(database, name, &password, nil) -} - -func (self *Client) UpdateDatabaseUserPermissions(database, name, readPermission, writePermissions string) error { - return self.updateDatabaseUserCommon(database, name, nil, nil, readPermission, writePermissions) -} - -func (self *Client) DeleteDatabaseUser(database, name string) error { - url := self.getUrl("/db/" + database + "/users/" + name) - resp, err := self.del(url) - return responseToError(resp, err, true) -} - -func (self *Client) GetDatabaseUserList(database string) ([]map[string]interface{}, error) { - url := self.getUrl("/db/" + database + "/users") - return self.listSomething(url) -} - -func (self *Client) AlterDatabasePrivilege(database, name string, isAdmin bool, permissions ...string) error { - return self.updateDatabaseUserCommon(database, name, nil, &isAdmin, permissions...) -} - -type TimePrecision string - const ( - Second TimePrecision = "s" - Millisecond TimePrecision = "ms" - Microsecond TimePrecision = "u" + ConsistencyOne = "one" + ConsistencyAll = "all" + ConsistencyQuorum = "quorum" + ConsistencyAny = "any" ) -func (self *Client) WriteSeries(series []*Series) error { - return self.writeSeriesCommon(series, nil) +// NewClient will instantiate and return a connected client to issue commands to the server. +func NewClient(c Config) (*Client, error) { + client := Client{ + url: c.URL, + username: c.Username, + password: c.Password, + httpClient: &http.Client{Timeout: c.Timeout}, + userAgent: c.UserAgent, + } + if client.userAgent == "" { + client.userAgent = "InfluxDBClient" + } + return &client, nil } -func (self *Client) WriteSeriesOverUDP(series []*Series) error { - if self.udpConn == nil { - return fmt.Errorf("UDP isn't enabled. Make sure to set config.IsUDP to true") +// SetAuth will update the username and passwords +func (c *Client) SetAuth(u, p string) { + c.username = u + c.password = p +} + +// Query sends a command to the server and returns the Response +func (c *Client) Query(q Query) (*Response, error) { + u := c.url + + u.Path = "query" + values := u.Query() + values.Set("q", q.Command) + values.Set("db", q.Database) + u.RawQuery = values.Encode() + + req, err := http.NewRequest("GET", u.String(), nil) + if err != nil { + return nil, err + } + req.Header.Set("User-Agent", c.userAgent) + if c.username != "" { + req.SetBasicAuth(c.username, c.password) } - data, err := json.Marshal(series) + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + var response Response + dec := json.NewDecoder(resp.Body) + dec.UseNumber() + decErr := dec.Decode(&response) + + // ignore this error if we got an invalid status code + if decErr != nil && decErr.Error() == "EOF" && resp.StatusCode != http.StatusOK { + decErr = nil + } + // If we got a valid decode error, send that back + if decErr != nil { + return nil, decErr + } + // If we don't have an error in our json response, and didn't get statusOK, then send back an error + if resp.StatusCode != http.StatusOK && response.Error() == nil { + return &response, fmt.Errorf("received status code %d from server", resp.StatusCode) + } + return &response, nil +} + +// Write takes BatchPoints and allows for writing of multiple points with defaults +// If successful, error is nil and Response is nil +// If an error occurs, Response may contain additional information if populated. +func (c *Client) Write(bp BatchPoints) (*Response, error) { + c.url.Path = "write" + + var b bytes.Buffer + for _, p := range bp.Points { + if p.Raw != "" { + if _, err := b.WriteString(p.Raw); err != nil { + return nil, err + } + } else { + for k, v := range bp.Tags { + if p.Tags == nil { + p.Tags = make(map[string]string, len(bp.Tags)) + } + p.Tags[k] = v + } + + if _, err := b.WriteString(p.MarshalString()); err != nil { + return nil, err + } + } + + if err := b.WriteByte('\n'); err != nil { + return nil, err + } + } + + req, err := http.NewRequest("POST", c.url.String(), &b) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", "") + req.Header.Set("User-Agent", c.userAgent) + if c.username != "" { + req.SetBasicAuth(c.username, c.password) + } + params := req.URL.Query() + params.Add("db", bp.Database) + params.Add("rp", bp.RetentionPolicy) + params.Add("precision", bp.Precision) + params.Add("consistency", bp.WriteConsistency) + req.URL.RawQuery = params.Encode() + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + var response Response + body, err := ioutil.ReadAll(resp.Body) + if err != nil && err.Error() != "EOF" { + return nil, err + } + + if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK { + var err = fmt.Errorf(string(body)) + response.Err = err + return &response, err + } + + return nil, nil +} + +// Ping will check to see if the server is up +// Ping returns how long the request took, the version of the server it connected to, and an error if one occurred. +func (c *Client) Ping() (time.Duration, string, error) { + now := time.Now() + u := c.url + u.Path = "ping" + + req, err := http.NewRequest("GET", u.String(), nil) + if err != nil { + return 0, "", err + } + req.Header.Set("User-Agent", c.userAgent) + if c.username != "" { + req.SetBasicAuth(c.username, c.password) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return 0, "", err + } + defer resp.Body.Close() + + version := resp.Header.Get("X-Influxdb-Version") + return time.Since(now), version, nil +} + +// Dump connects to server and retrieves all data stored for specified database. +// If successful, Dump returns the entire response body, which is an io.ReadCloser +func (c *Client) Dump(db string) (io.ReadCloser, error) { + u := c.url + u.Path = "dump" + values := u.Query() + values.Set("db", db) + u.RawQuery = values.Encode() + + req, err := http.NewRequest("GET", u.String(), nil) + if err != nil { + return nil, err + } + req.Header.Set("User-Agent", c.userAgent) + if c.username != "" { + req.SetBasicAuth(c.username, c.password) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + if resp.StatusCode != http.StatusOK { + return resp.Body, fmt.Errorf("HTTP Protocol error %d", resp.StatusCode) + } + return resp.Body, nil +} + +// Structs + +// Result represents a resultset returned from a single statement. +type Result struct { + Series []influxql.Row + Err error +} + +// MarshalJSON encodes the result into JSON. +func (r *Result) MarshalJSON() ([]byte, error) { + // Define a struct that outputs "error" as a string. + var o struct { + Series []influxql.Row `json:"series,omitempty"` + Err string `json:"error,omitempty"` + } + + // Copy fields to output struct. + o.Series = r.Series + if r.Err != nil { + o.Err = r.Err.Error() + } + + return json.Marshal(&o) +} + +// UnmarshalJSON decodes the data into the Result struct +func (r *Result) UnmarshalJSON(b []byte) error { + var o struct { + Series []influxql.Row `json:"series,omitempty"` + Err string `json:"error,omitempty"` + } + + dec := json.NewDecoder(bytes.NewBuffer(b)) + dec.UseNumber() + err := dec.Decode(&o) if err != nil { return err } - // because max of msg over upd is 2048 bytes - // https://github.com/influxdb/influxdb/blob/master/src/api/udp/api.go#L65 - if len(data) >= UDPMaxMessageSize { - err = fmt.Errorf("data size over limit %v limit is %v", len(data), UDPMaxMessageSize) - fmt.Println(err) - return err - } - _, err = self.udpConn.Write(data) - if err != nil { - return err + r.Series = o.Series + if o.Err != "" { + r.Err = errors.New(o.Err) } return nil } -func (self *Client) WriteSeriesWithTimePrecision(series []*Series, timePrecision TimePrecision) error { - return self.writeSeriesCommon(series, map[string]string{"time_precision": string(timePrecision)}) +// Response represents a list of statement results. +type Response struct { + Results []Result + Err error } -func (self *Client) writeSeriesCommon(series []*Series, options map[string]string) error { - data, err := json.Marshal(series) +// MarshalJSON encodes the response into JSON. +func (r *Response) MarshalJSON() ([]byte, error) { + // Define a struct that outputs "error" as a string. + var o struct { + Results []Result `json:"results,omitempty"` + Err string `json:"error,omitempty"` + } + + // Copy fields to output struct. + o.Results = r.Results + if r.Err != nil { + o.Err = r.Err.Error() + } + + return json.Marshal(&o) +} + +// UnmarshalJSON decodes the data into the Response struct +func (r *Response) UnmarshalJSON(b []byte) error { + var o struct { + Results []Result `json:"results,omitempty"` + Err string `json:"error,omitempty"` + } + + dec := json.NewDecoder(bytes.NewBuffer(b)) + dec.UseNumber() + err := dec.Decode(&o) if err != nil { return err } - url := self.getUrl("/db/" + self.database + "/series") - for name, value := range options { - url += fmt.Sprintf("&%s=%s", name, value) + r.Results = o.Results + if o.Err != "" { + r.Err = errors.New(o.Err) } - var b *bytes.Buffer - if self.compression { - b = bytes.NewBuffer(nil) - w := gzip.NewWriter(b) - if _, err := w.Write(data); err != nil { + return nil +} + +// Error returns the first error from any statement. +// Returns nil if no errors occurred on any statements. +func (r Response) Error() error { + if r.Err != nil { + return r.Err + } + for _, result := range r.Results { + if result.Err != nil { + return result.Err + } + } + return nil +} + +// Point defines the fields that will be written to the database +// Measurement, Time, and Fields are required +// Precision can be specified if the time is in epoch format (integer). +// Valid values for Precision are n, u, ms, s, m, and h +type Point struct { + Measurement string + Tags map[string]string + Time time.Time + Fields map[string]interface{} + Precision string + Raw string +} + +// MarshalJSON will format the time in RFC3339Nano +// Precision is also ignored as it is only used for writing, not reading +// Or another way to say it is we always send back in nanosecond precision +func (p *Point) MarshalJSON() ([]byte, error) { + point := struct { + Measurement string `json:"measurement,omitempty"` + Tags map[string]string `json:"tags,omitempty"` + Time string `json:"time,omitempty"` + Fields map[string]interface{} `json:"fields,omitempty"` + Precision string `json:"precision,omitempty"` + }{ + Measurement: p.Measurement, + Tags: p.Tags, + Fields: p.Fields, + Precision: p.Precision, + } + // Let it omit empty if it's really zero + if !p.Time.IsZero() { + point.Time = p.Time.UTC().Format(time.RFC3339Nano) + } + return json.Marshal(&point) +} + +func (p *Point) MarshalString() string { + return tsdb.NewPoint(p.Measurement, p.Tags, p.Fields, p.Time).String() +} + +// UnmarshalJSON decodes the data into the Point struct +func (p *Point) UnmarshalJSON(b []byte) error { + var normal struct { + Measurement string `json:"measurement"` + Tags map[string]string `json:"tags"` + Time time.Time `json:"time"` + Precision string `json:"precision"` + Fields map[string]interface{} `json:"fields"` + } + var epoch struct { + Measurement string `json:"measurement"` + Tags map[string]string `json:"tags"` + Time *int64 `json:"time"` + Precision string `json:"precision"` + Fields map[string]interface{} `json:"fields"` + } + + if err := func() error { + var err error + dec := json.NewDecoder(bytes.NewBuffer(b)) + dec.UseNumber() + if err = dec.Decode(&epoch); err != nil { return err } - w.Flush() - w.Close() - } else { - b = bytes.NewBuffer(data) + // Convert from epoch to time.Time, but only if Time + // was actually set. + var ts time.Time + if epoch.Time != nil { + ts, err = EpochToTime(*epoch.Time, epoch.Precision) + if err != nil { + return err + } + } + p.Measurement = epoch.Measurement + p.Tags = epoch.Tags + p.Time = ts + p.Precision = epoch.Precision + p.Fields = normalizeFields(epoch.Fields) + return nil + }(); err == nil { + return nil } - req, err := http.NewRequest("POST", url, b) - if err != nil { + + dec := json.NewDecoder(bytes.NewBuffer(b)) + dec.UseNumber() + if err := dec.Decode(&normal); err != nil { return err } - if self.compression { - req.Header.Set("Content-Encoding", "gzip") - } - resp, err := self.httpClient.Do(req) - return responseToError(resp, err, true) + normal.Time = SetPrecision(normal.Time, normal.Precision) + p.Measurement = normal.Measurement + p.Tags = normal.Tags + p.Time = normal.Time + p.Precision = normal.Precision + p.Fields = normalizeFields(normal.Fields) + + return nil } -func (self *Client) Query(query string, precision ...TimePrecision) ([]*Series, error) { - return self.queryCommon(query, false, precision...) +// Remove any notion of json.Number +func normalizeFields(fields map[string]interface{}) map[string]interface{} { + newFields := map[string]interface{}{} + + for k, v := range fields { + switch v := v.(type) { + case json.Number: + jv, e := v.Float64() + if e != nil { + panic(fmt.Sprintf("unable to convert json.Number to float64: %s", e)) + } + newFields[k] = jv + default: + newFields[k] = v + } + } + return newFields } -func (self *Client) QueryWithNumbers(query string, precision ...TimePrecision) ([]*Series, error) { - return self.queryCommon(query, true, precision...) +// BatchPoints is used to send batched data in a single write. +// Database and Points are required +// If no retention policy is specified, it will use the databases default retention policy. +// If tags are specified, they will be "merged" with all points. If a point already has that tag, it is ignored. +// If time is specified, it will be applied to any point with an empty time. +// Precision can be specified if the time is in epoch format (integer). +// Valid values for Precision are n, u, ms, s, m, and h +type BatchPoints struct { + Points []Point `json:"points,omitempty"` + Database string `json:"database,omitempty"` + RetentionPolicy string `json:"retentionPolicy,omitempty"` + Tags map[string]string `json:"tags,omitempty"` + Time time.Time `json:"time,omitempty"` + Precision string `json:"precision,omitempty"` + WriteConsistency string `json:"-"` } -func (self *Client) queryCommon(query string, useNumber bool, precision ...TimePrecision) ([]*Series, error) { - escapedQuery := url.QueryEscape(query) - url := self.getUrl("/db/" + self.database + "/series") - if len(precision) > 0 { - url += "&time_precision=" + string(precision[0]) +// UnmarshalJSON decodes the data into the BatchPoints struct +func (bp *BatchPoints) UnmarshalJSON(b []byte) error { + var normal struct { + Points []Point `json:"points"` + Database string `json:"database"` + RetentionPolicy string `json:"retentionPolicy"` + Tags map[string]string `json:"tags"` + Time time.Time `json:"time"` + Precision string `json:"precision"` } - url += "&q=" + escapedQuery - req, err := http.NewRequest("GET", url, nil) - if err != nil { - return nil, err - } - if !self.compression { - req.Header.Set("Accept-Encoding", "identity") - } - resp, err := self.httpClient.Do(req) - err = responseToError(resp, err, false) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - series := []*Series{} - decoder := json.NewDecoder(resp.Body) - if useNumber { - decoder.UseNumber() - } - err = decoder.Decode(&series) - if err != nil { - return nil, err - } - return series, nil -} - -func (self *Client) Ping() error { - url := self.getUrl("/ping") - resp, err := self.httpClient.Get(url) - return responseToError(resp, err, true) -} - -func (self *Client) AuthenticateDatabaseUser(database, username, password string) error { - url := self.getUrlWithUserAndPass(fmt.Sprintf("/db/%s/authenticate", database), username, password) - resp, err := self.httpClient.Get(url) - return responseToError(resp, err, true) -} - -func (self *Client) AuthenticateClusterAdmin(username, password string) error { - url := self.getUrlWithUserAndPass("/cluster_admins/authenticate", username, password) - resp, err := self.httpClient.Get(url) - return responseToError(resp, err, true) -} - -type LongTermShortTermShards struct { - // Long term shards, (doesn't get populated for version >= 0.8.0) - LongTerm []*Shard `json:"longTerm"` - // Short term shards, (doesn't get populated for version >= 0.8.0) - ShortTerm []*Shard `json:"shortTerm"` - // All shards in the system (Long + Short term shards for version < 0.8.0) - All []*Shard `json:"-"` -} - -type Shard struct { - Id uint32 `json:"id"` - EndTime int64 `json:"endTime"` - StartTime int64 `json:"startTime"` - ServerIds []uint32 `json:"serverIds"` - SpaceName string `json:"spaceName"` - Database string `json:"database"` -} - -type ShardSpaceCollection struct { - ShardSpaces []ShardSpace -} - -func (self *Client) GetShards() (*LongTermShortTermShards, error) { - url := self.getUrlWithUserAndPass("/cluster/shards", self.username, self.password) - body, version, err := self.getWithVersion(url) - if err != nil { - return nil, err - } - return parseShards(body, version) -} - -func isOrNewerThan(version, reference string) bool { - if version == "vdev" { - return true - } - majorMinor := strings.Split(version[1:], ".")[:2] - refMajorMinor := strings.Split(reference[1:], ".")[:2] - if majorMinor[0] > refMajorMinor[0] { - return true - } - if majorMinor[1] > refMajorMinor[1] { - return true - } - return majorMinor[1] == refMajorMinor[1] -} - -func parseShards(body []byte, version string) (*LongTermShortTermShards, error) { - // strip the initial v in `v0.8.0` and split on the dots - if version != "" && isOrNewerThan(version, "v0.8") { - return parseNewShards(body) - } - shards := &LongTermShortTermShards{} - err := json.Unmarshal(body, &shards) - if err != nil { - return nil, err + var epoch struct { + Points []Point `json:"points"` + Database string `json:"database"` + RetentionPolicy string `json:"retentionPolicy"` + Tags map[string]string `json:"tags"` + Time *int64 `json:"time"` + Precision string `json:"precision"` } - shards.All = make([]*Shard, len(shards.LongTerm)+len(shards.ShortTerm)) - copy(shards.All, shards.LongTerm) - copy(shards.All[len(shards.LongTerm):], shards.ShortTerm) - return shards, nil -} - -func parseNewShards(body []byte) (*LongTermShortTermShards, error) { - shards := []*Shard{} - err := json.Unmarshal(body, &shards) - if err != nil { - return nil, err + if err := func() error { + var err error + if err = json.Unmarshal(b, &epoch); err != nil { + return err + } + // Convert from epoch to time.Time + var ts time.Time + if epoch.Time != nil { + ts, err = EpochToTime(*epoch.Time, epoch.Precision) + if err != nil { + return err + } + } + bp.Points = epoch.Points + bp.Database = epoch.Database + bp.RetentionPolicy = epoch.RetentionPolicy + bp.Tags = epoch.Tags + bp.Time = ts + bp.Precision = epoch.Precision + return nil + }(); err == nil { + return nil } - return &LongTermShortTermShards{All: shards}, nil -} - -// Added to InfluxDB in 0.8.0 -func (self *Client) GetShardSpaces() ([]*ShardSpace, error) { - url := self.getUrlWithUserAndPass("/cluster/shard_spaces", self.username, self.password) - body, err := self.get(url) - if err != nil { - return nil, err - } - spaces := []*ShardSpace{} - err = json.Unmarshal(body, &spaces) - if err != nil { - return nil, err - } - - return spaces, nil -} - -// Added to InfluxDB in 0.8.0 -func (self *Client) DropShardSpace(database, name string) error { - url := self.getUrlWithUserAndPass(fmt.Sprintf("/cluster/shard_spaces/%s/%s", database, name), self.username, self.password) - _, err := self.del(url) - return err -} - -// Added to InfluxDB in 0.8.0 -func (self *Client) CreateShardSpace(database string, space *ShardSpace) error { - url := self.getUrl(fmt.Sprintf("/cluster/shard_spaces/%s", database)) - data, err := json.Marshal(space) - if err != nil { + if err := json.Unmarshal(b, &normal); err != nil { return err } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) + normal.Time = SetPrecision(normal.Time, normal.Precision) + bp.Points = normal.Points + bp.Database = normal.Database + bp.RetentionPolicy = normal.RetentionPolicy + bp.Tags = normal.Tags + bp.Time = normal.Time + bp.Precision = normal.Precision + + return nil } -func (self *Client) DropShard(id uint32, serverIds []uint32) error { - url := self.getUrlWithUserAndPass(fmt.Sprintf("/cluster/shards/%d", id), self.username, self.password) - ids := map[string][]uint32{"serverIds": serverIds} - body, err := json.Marshal(ids) - if err != nil { - return err - } - _, err = self.delWithBody(url, bytes.NewBuffer(body)) - return err +// utility functions + +// Addr provides the current url as a string of the server the client is connected to. +func (c *Client) Addr() string { + return c.url.String() } -// Added to InfluxDB in 0.8.2 -func (self *Client) UpdateShardSpace(database, name string, space *ShardSpace) error { - url := self.getUrl(fmt.Sprintf("/cluster/shard_spaces/%s/%s", database, name)) - data, err := json.Marshal(space) - if err != nil { - return err +// helper functions + +// EpochToTime takes a unix epoch time and uses precision to return back a time.Time +func EpochToTime(epoch int64, precision string) (time.Time, error) { + if precision == "" { + precision = "s" } - resp, err := self.httpClient.Post(url, "application/json", bytes.NewBuffer(data)) - return responseToError(resp, err, true) + var t time.Time + switch precision { + case "h": + t = time.Unix(0, epoch*int64(time.Hour)) + case "m": + t = time.Unix(0, epoch*int64(time.Minute)) + case "s": + t = time.Unix(0, epoch*int64(time.Second)) + case "ms": + t = time.Unix(0, epoch*int64(time.Millisecond)) + case "u": + t = time.Unix(0, epoch*int64(time.Microsecond)) + case "n": + t = time.Unix(0, epoch) + default: + return time.Time{}, fmt.Errorf("Unknown precision %q", precision) + } + return t, nil +} + +// SetPrecision will round a time to the specified precision +func SetPrecision(t time.Time, precision string) time.Time { + switch precision { + case "n": + case "u": + return t.Round(time.Microsecond) + case "ms": + return t.Round(time.Millisecond) + case "s": + return t.Round(time.Second) + case "m": + return t.Round(time.Minute) + case "h": + return t.Round(time.Hour) + } + return t } diff --git a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/series.go b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/series.go deleted file mode 100644 index f18b8bbb59e..00000000000 --- a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/series.go +++ /dev/null @@ -1,19 +0,0 @@ -package client - -type Series struct { - Name string `json:"name"` - Columns []string `json:"columns"` - Points [][]interface{} `json:"points"` -} - -func (self *Series) GetName() string { - return self.Name -} - -func (self *Series) GetColumns() []string { - return self.Columns -} - -func (self *Series) GetPoints() [][]interface{} { - return self.Points -} diff --git a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/shard_space.go b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/shard_space.go deleted file mode 100644 index 87dea1173bc..00000000000 --- a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/shard_space.go +++ /dev/null @@ -1,15 +0,0 @@ -package client - -type ShardSpace struct { - // required, must be unique within the database - Name string `json:"name"` - // required, a database has many shard spaces and a shard space belongs to a database - Database string `json:"database"` - // this is optional, if they don't set it, we'll set to /.*/ - Regex string `json:"regex"` - // this is optional, if they don't set it, it will default to the storage.dir in the config - RetentionPolicy string `json:"retentionPolicy"` - ShardDuration string `json:"shardDuration"` - ReplicationFactor uint32 `json:"replicationFactor"` - Split uint32 `json:"split"` -} diff --git a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/INFLUXQL.md b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/INFLUXQL.md new file mode 100644 index 00000000000..087fc3b9ff6 --- /dev/null +++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/INFLUXQL.md @@ -0,0 +1,650 @@ +# The Influx Query Language Specification + +## Introduction + +This is a reference for the Influx Query Language ("InfluxQL"). + +InfluxQL is a SQL-like query language for interacting with InfluxDB. It has been lovingly crafted to feel familiar to those coming from other SQL or SQL-like environments while providing features specific to storing and analyzing time series data. + +## Notation + +The syntax is specified using Extended Backus-Naur Form ("EBNF"). EBNF is the same notation used in the [Go](http://golang.org) programming language specification, which can be found [here](https://golang.org/ref/spec). Not so coincidentally, InfluxDB is written in Go. + +``` +Production = production_name "=" [ Expression ] "." . +Expression = Alternative { "|" Alternative } . +Alternative = Term { Term } . +Term = production_name | token [ "…" token ] | Group | Option | Repetition . +Group = "(" Expression ")" . +Option = "[" Expression "]" . +Repetition = "{" Expression "}" . +``` + +Notation operators in order of increasing precedence: + +``` +| alternation +() grouping +[] option (0 or 1 times) +{} repetition (0 to n times) +``` + +## Query representation + +### Characters + +InfluxQL is Unicode text encoded in [UTF-8](http://en.wikipedia.org/wiki/UTF-8). + +``` +newline = /* the Unicode code point U+000A */ . +unicode_char = /* an arbitrary Unicode code point except newline */ . +``` + +## Letters and digits + +Letters are the set of ASCII characters plus the underscore character _ (U+005F) is considered a letter. + +Only decimal digits are supported. + +``` +letter = ascii_letter | "_" . +ascii_letter = "A" … "Z" | "a" … "z" . +digit = "0" … "9" . +``` + +## Identifiers + +Identifiers are tokens which refer to database names, retention policy names, user names, measurement names, tag keys, and field names. + +The rules: + +- double quoted identifiers can contain any unicode character other than a new line +- double quoted identifiers can contain escaped `"` characters (i.e., `\"`) +- unquoted identifiers must start with an upper or lowercase ASCII character or "_" +- unquoted identifiers may contain only ASCII letters, decimal digits, and "_" + +``` +identifier = unquoted_identifier | quoted_identifier . +unquoted_identifier = ( letter ) { letter | digit } . +quoted_identifier = `"` unicode_char { unicode_char } `"` . +``` + +#### Examples: + +``` +cpu +_cpu_stats +"1h" +"anything really" +"1_Crazy-1337.identifier>NAME👍" +``` + +## Keywords + +``` +ALL ALTER AS ASC BEGIN BY +CREATE CONTINUOUS DATABASE DATABASES DEFAULT DELETE +DESC DROP DURATION END EXISTS EXPLAIN +FIELD FROM GRANT GROUP IF IN +INNER INSERT INTO KEY KEYS LIMIT +SHOW MEASUREMENT MEASUREMENTS OFFSET ON ORDER +PASSWORD POLICY POLICIES PRIVILEGES QUERIES QUERY +READ REPLICATION RETENTION REVOKE SELECT SERIES +SLIMIT SOFFSET TAG TO USER USERS +VALUES WHERE WITH WRITE +``` + +## Literals + +### Integers + +InfluxQL supports decimal integer literals. Hexadecimal and octal literals are not currently supported. + +``` +int_lit = ( "1" … "9" ) { digit } . +``` + +### Floats + +InfluxQL supports floating-point literals. Exponents are not currently supported. + +``` +float_lit = int_lit "." int_lit . +``` + +### Strings + +String literals must be surrounded by single quotes. Strings may contain `'` characters as long as they are escaped (i.e., `\'`). + +``` +string_lit = `'` { unicode_char } `'`' . +``` + +### Durations + +Duration literals specify a length of time. An integer literal followed immediately (with no spaces) by a duration unit listed below is interpreted as a duration literal. + +``` +Duration unit definitions +------------------------- +| Units | Meaning | +|--------|-----------------------------------------| +| u or µ | microseconds (1 millionth of a second) | +| ms | milliseconds (1 thousandth of a second) | +| s | second | +| m | minute | +| h | hour | +| d | day | +| w | week | +``` + +``` +duration_lit = int_lit duration_unit . +duration_unit = "u" | "µ" | "s" | "h" | "d" | "w" | "ms" . +``` + +### Dates & Times + +The date and time literal format is not specified in EBNF like the rest of this document. It is specified using Go's date / time parsing format, which is a reference date written in the format required by InfluxQL. The reference date time is: + +InfluxQL reference date time: January 2nd, 2006 at 3:04:05 PM + +``` +time_lit = "2006-01-02 15:04:05.999999" | "2006-01-02" +``` + +### Booleans + +``` +bool_lit = TRUE | FALSE . +``` + +### Regular Expressions + +``` +regex_lit = "/" { unicode_char } "/" . +``` + +## Queries + +A query is composed of one or more statements separated by a semicolon. + +``` +query = statement { ; statement } . + +statement = alter_retention_policy_stmt | + create_continuous_query_stmt | + create_database_stmt | + create_retention_policy_stmt | + create_user_stmt | + delete_stmt | + drop_continuous_query_stmt | + drop_database_stmt | + drop_measurement_stmt | + drop_retention_policy_stmt | + drop_series_stmt | + drop_user_stmt | + grant_stmt | + show_continuous_queries_stmt | + show_databases_stmt | + show_field_keys_stmt | + show_measurements_stmt | + show_retention_policies | + show_series_stmt | + show_tag_keys_stmt | + show_tag_values_stmt | + show_users_stmt | + revoke_stmt | + select_stmt . +``` + +## Statements + +### ALTER RETENTION POLICY + +``` +alter_retention_policy_stmt = "ALTER RETENTION POLICY" policy_name "ON" + db_name retention_policy_option + [ retention_policy_option ] + [ retention_policy_option ] . + +db_name = identifier . + +policy_name = identifier . + +retention_policy_option = retention_policy_duration | + retention_policy_replication | + "DEFAULT" . + +retention_policy_duration = "DURATION" duration_lit . +retention_policy_replication = "REPLICATION" int_lit +``` + +#### Examples: + +```sql +-- Set default retention policy for mydb to 1h.cpu. +ALTER RETENTION POLICY "1h.cpu" ON mydb DEFAULT; + +-- Change duration and replication factor. +ALTER RETENTION POLICY policy1 ON somedb DURATION 1h REPLICATION 4 +``` + +### CREATE CONTINUOUS QUERY + +``` +create_continuous_query_stmt = "CREATE CONTINUOUS QUERY" query_name "ON" db_name + "BEGIN" select_stmt "END" . + +query_name = identifier . +``` + +#### Examples: + +```sql +-- selects from default retention policy and writes into 6_months retention policy +CREATE CONTINUOUS QUERY "10m_event_count" +ON db_name +BEGIN + SELECT count(value) + INTO "6_months".events + FROM events + GROUP BY time(10m) +END; + +-- this selects from the output of one continuous query in one retention policy and outputs to another series in another retention policy +CREATE CONTINUOUS QUERY "1h_event_count" +ON db_name +BEGIN + SELECT sum(count) as count + INTO "2_years".events + FROM "6_months".events + GROUP BY time(1h) +END; +``` + +### CREATE DATABASE + +``` +create_database_stmt = "CREATE DATABASE" db_name +``` + +#### Example: + +```sql +CREATE DATABASE foo +``` + +### CREATE RETENTION POLICY + +``` +create_retention_policy_stmt = "CREATE RETENTION POLICY" policy_name "ON" + db_name retention_policy_duration + retention_policy_replication + [ "DEFAULT" ] . +``` + +#### Examples + +```sql +-- Create a retention policy. +CREATE RETENTION POLICY "10m.events" ON somedb DURATION 10m REPLICATION 2; + +-- Create a retention policy and set it as the default. +CREATE RETENTION POLICY "10m.events" ON somedb DURATION 10m REPLICATION 2 DEFAULT; +``` + +### CREATE USER + +``` +create_user_stmt = "CREATE USER" user_name "WITH PASSWORD" password + [ "WITH ALL PRIVILEGES" ] . +``` + +#### Examples: + +```sql +-- Create a normal database user. +CREATE USER jdoe WITH PASSWORD '1337password'; + +-- Create a cluster admin. +-- Note: Unlike the GRANT statement, the "PRIVILEGES" keyword is required here. +CREATE USER jdoe WITH PASSWORD '1337password' WITH ALL PRIVILEGES; +``` + +### DELETE + +``` +delete_stmt = "DELETE" from_clause where_clause . +``` + +#### Example: + +```sql +-- delete data points from the cpu measurement where the region tag +-- equals 'uswest' +DELETE FROM cpu WHERE region = 'uswest'; +``` + +### DROP CONTINUOUS QUERY + +drop_continuous_query_stmt = "DROP CONTINUOUS QUERY" query_name . + +#### Example: + +```sql +DROP CONTINUOUS QUERY myquery; +``` + +### DROP DATABASE + +drop_database_stmt = "DROP DATABASE" db_name . + +#### Example: + +```sql +DROP DATABASE mydb; +``` + +### DROP MEASUREMENT + +``` +drop_measurement_stmt = "DROP MEASUREMENT" measurement . +``` + +#### Examples: + +```sql +-- drop the cpu measurement +DROP MEASUREMENT cpu; +``` + +### DROP RETENTION POLICY + +``` +drop_retention_policy_stmt = "DROP RETENTION POLICY" policy_name "ON" db_name . +``` + +#### Example: + +```sql +-- drop the retention policy named 1h.cpu from mydb +DROP RETENTION POLICY "1h.cpu" ON mydb; +``` + +### DROP SERIES + +``` +drop_series_stmt = "DROP SERIES" [ from_clause ] [ where_clause ] +``` + +#### Example: + +```sql + +``` + +### DROP USER + +``` +drop_user_stmt = "DROP USER" user_name . +``` + +#### Example: + +```sql +DROP USER jdoe; + +``` + +### GRANT + +NOTE: Users can be granted privileges on databases that do not exist. + +``` +grant_stmt = "GRANT" privilege [ on_clause ] to_clause +``` + +#### Examples: + +```sql +-- grant cluster admin privileges +GRANT ALL TO jdoe; + +-- grant read access to a database +GRANT READ ON mydb TO jdoe; +``` + +### SHOW CONTINUOUS QUERIES + +show_continuous_queries_stmt = "SHOW CONTINUOUS QUERIES" + +#### Example: + +```sql +-- show all continuous queries +SHOW CONTINUOUS QUERIES; +``` + +### SHOW DATABASES + +``` +show_databases_stmt = "SHOW DATABASES" . +``` + +#### Example: + +```sql +-- show all databases +SHOW DATABASES; +``` + +### SHOW FIELD + +show_field_keys_stmt = "SHOW FIELD KEYS" [ from_clause ] . + +#### Examples: + +```sql +-- show field keys from all measurements +SHOW FIELD KEYS; + +-- show field keys from specified measurement +SHOW FIELD KEYS FROM cpu; +``` + +### SHOW MEASUREMENTS + +show_measurements_stmt = [ where_clause ] [ group_by_clause ] [ limit_clause ] + [ offset_clause ] . + +```sql +-- show all measurements +SHOW MEASUREMENTS; + +-- show measurements where region tag = 'uswest' AND host tag = 'serverA' +SHOW MEASUREMENTS WHERE region = 'uswest' AND host = 'serverA'; +``` + +### SHOW RETENTION POLICIES + +``` +show_retention_policies = "SHOW RETENTION POLICIES" db_name . +``` + +#### Example: + +```sql +-- show all retention policies on a database +SHOW RETENTION POLICIES mydb; +``` + +### SHOW SERIES + +``` +show_series_stmt = [ from_clause ] [ where_clause ] [ group_by_clause ] + [ limit_clause ] [ offset_clause ] . +``` + +#### Example: + +```sql + +``` + +### SHOW TAG KEYS + +``` +show_tag_keys_stmt = [ from_clause ] [ where_clause ] [ group_by_clause ] + [ limit_clause ] [ offset_clause ] . +``` + +#### Examples: + +```sql +-- show all tag keys +SHOW TAG KEYS; + +-- show all tag keys from the cpu measurement +SHOW TAG KEYS FROM cpu; + +-- show all tag keys from the cpu measurement where the region key = 'uswest' +SHOW TAG KEYS FROM cpu WHERE region = 'uswest'; + +-- show all tag keys where the host key = 'serverA' +SHOW TAG KEYS WHERE host = 'serverA'; +``` + +### SHOW TAG VALUES + +``` +show_tag_values_stmt = [ from_clause ] with_tag_clause [ where_clause ] + [ group_by_clause ] [ limit_clause ] [ offset_clause ] . +``` + +#### Examples: + +```sql +-- show all tag values across all measurements for the region tag +SHOW TAG VALUES WITH TAG = 'region'; + +-- show tag values from the cpu measurement for the region tag +SHOW TAG VALUES FROM cpu WITH TAG = 'region'; + +-- show tag values from the cpu measurement for region & host tag keys where service = 'redis' +SHOW TAG VALUES FROM cpu WITH TAG IN (region, host) WHERE service = 'redis'; +``` + +### SHOW USERS + +``` +show_users_stmt = "SHOW USERS" . +``` + +#### Example: + +```sql +-- show all users +SHOW USERS; +``` + +### REVOKE + +``` +revoke_stmt = privilege [ "ON" db_name ] "FROM" user_name +``` + +#### Examples: + +```sql +-- revoke cluster admin from jdoe +REVOKE ALL PRIVILEGES FROM jdoe; + +-- revoke read privileges from jdoe on mydb +REVOKE READ ON mydb FROM jdoe; +``` + +### SELECT + +``` +select_stmt = fields from_clause [ into_clause ] [ where_clause ] + [ group_by_clause ] [ order_by_clause ] [ limit_clause ] + [ offset_clause ] [ slimit_clause ] [ soffset_clause ]. +``` + +#### Examples: + +```sql +-- select mean value from the cpu measurement where region = 'uswest' grouped by 10 minute intervals +SELECT mean(value) FROM cpu WHERE region = 'uswest' GROUP BY time(10m) fill(0); +``` + +## Clauses + +``` +from_clause = "FROM" measurements . + +group_by_clause = "GROUP BY" dimensions fill(