Kubernetes Mesos integration

This commit includes the fundamental components of the Kubernetes Mesos
integration:

* Kubernetes-Mesos scheduler
* Kubernetes-Mesos executor
* Supporting libs

Dependencies and upstream changes are included in a separate commit for easy
review.

After this initial upstream merge, two follow-up PRs will be submitted:

* km (hypercube) and k8sm-controller-manager #9265
* Static pods support #9077

Fixes applied:

- Precise metrics subsystem definitions
  - mesosphere/kubernetes-mesos#331
  - https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion_r31875232
  - https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion_r31875240
- Improve comments and add clarifications
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875208
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875226
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875227
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875228
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875239
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875243
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875234
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875256
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875255
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875251
- Clarify which Schedule function is actually called
  - Fixes https://github.com/GoogleCloudPlatform/kubernetes/pull/8882#discussion-diff-31875246
Author: James DeFelice
Date:   2015-06-10 20:58:22 +00:00
Parent: 7d66559725
Commit: 932c58a497
105 changed files with 15162 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package main implements the executable Kubernetes-Mesos executor.
package main

View File

@@ -0,0 +1,47 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"fmt"
"os"
"runtime"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/service"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/version/verflag"
"github.com/spf13/pflag"
)
func main() {
runtime.GOMAXPROCS(runtime.NumCPU())
s := service.NewKubeletExecutorServer()
s.AddStandaloneFlags(pflag.CommandLine)
util.InitFlags()
util.InitLogs()
defer util.FlushLogs()
verflag.PrintAndExitIfRequested()
if err := s.Run(hyperkube.Nil(), pflag.CommandLine.Args()); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}

View File

@@ -0,0 +1,21 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package main implements k8sm-redirfd, a small helper used for testing the redirfd package.
// Inspired by http://skarnet.org/software/execline/redirfd.html.
// Usage:
//   k8sm-redirfd [-n] [-b] {mode} {fd} {file} {prog...}
package main

View File

@@ -0,0 +1,105 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"flag"
"fmt"
"os"
"os/exec"
"syscall"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
)
func main() {
nonblock := flag.Bool("n", false, "open file in non-blocking mode")
changemode := flag.Bool("b", false, "change mode of file after opening it: to non-blocking mode if the -n option was not given, to blocking mode if it was")
flag.Parse()
args := flag.Args()
if len(args) < 4 {
fmt.Fprintf(os.Stderr, "expected {mode} {fd} {file} {prog...} instead of: %v\n", args)
os.Exit(1)
}
var mode redirfd.RedirectMode
switch m := args[0]; m {
case "r":
mode = redirfd.Read
case "w":
mode = redirfd.Write
case "u":
mode = redirfd.Update
case "a":
mode = redirfd.Append
case "c":
mode = redirfd.AppendExisting
case "x":
mode = redirfd.WriteNew
default:
fmt.Fprintf(os.Stderr, "unrecognized mode %q\n", m)
os.Exit(1)
}
fd, err := redirfd.ParseFileDescriptor(args[1])
if err != nil {
fmt.Fprintf(os.Stderr, "failed to parse file descriptor: %v\n", err)
os.Exit(1)
}
file := args[2]
f, err := mode.Redirect(*nonblock, *changemode, fd, file)
if err != nil {
fmt.Fprintf(os.Stderr, "redirect failed: %q, %v\n", args[1], err)
os.Exit(1)
}
var pargs []string
if len(args) > 4 {
pargs = args[4:]
}
cmd := exec.Command(args[3], pargs...)
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
switch fd {
case redirfd.Stdin:
cmd.Stdin = f
case redirfd.Stdout:
cmd.Stdout = f
case redirfd.Stderr:
cmd.Stderr = f
default:
cmd.ExtraFiles = []*os.File{f}
}
defer f.Close()
if err = cmd.Run(); err != nil {
// If the child exited abnormally, propagate its exit status or signal;
// for any other failure (e.g. the program could not be started), exit 3.
if exiterr, ok := err.(*exec.ExitError); ok && exiterr.ProcessState != nil {
if waitStatus, ok := exiterr.ProcessState.Sys().(syscall.WaitStatus); ok {
if waitStatus.Signaled() {
os.Exit(256 + int(waitStatus.Signal()))
}
os.Exit(waitStatus.ExitStatus())
}
}
os.Exit(3)
}
}

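For reference, the same redirection can be performed from Go code by using the redirfd package directly. The sketch below is illustrative only: it relies on the Redirect call shown in the file above, and the file path and target descriptor are made-up examples, not part of this commit.

package main

import (
	"fmt"
	"os"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
)

func main() {
	// Open /tmp/out.log in write ("w") mode for the stdout descriptor slot,
	// blocking, without changing the mode afterwards (all illustrative).
	mode := redirfd.Write
	f, err := mode.Redirect(false, false, redirfd.Stdout, "/tmp/out.log")
	if err != nil {
		fmt.Fprintf(os.Stderr, "redirect failed: %v\n", err)
		os.Exit(1)
	}
	defer f.Close()

	// Writes to the returned *os.File land in /tmp/out.log.
	fmt.Fprintln(f, "hello from redirfd")
}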
View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package main implements the executable Kubernetes-Mesos scheduler.
package main

View File

@@ -0,0 +1,46 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"fmt"
"os"
"runtime"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/service"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/version/verflag"
"github.com/spf13/pflag"
)
func main() {
runtime.GOMAXPROCS(runtime.NumCPU())
s := service.NewSchedulerServer()
s.AddStandaloneFlags(pflag.CommandLine)
util.InitFlags()
util.InitLogs()
defer util.FlushLogs()
verflag.PrintAndExitIfRequested()
if err := s.Run(hyperkube.Nil(), pflag.CommandLine.Args()); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}

View File

@@ -0,0 +1,43 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package assert
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
)
// EventuallyTrue asserts that the given predicate becomes true within the given timeout,
// polling it every 100ms.
func EventuallyTrue(t *testing.T, timeout time.Duration, fn func() bool, msgAndArgs ...interface{}) bool {
start := time.Now()
for {
if fn() {
return true
}
if time.Since(start) > timeout {
if len(msgAndArgs) > 0 {
return assert.Fail(t, msgAndArgs[0].(string), msgAndArgs[1:]...)
} else {
return assert.Fail(t, "predicate fn has not been true after %v", timeout.String())
}
}
time.Sleep(100 * time.Millisecond)
}
}

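A typical call site for this helper might look like the sketch below. The polled condition and timeouts are illustrative; the import alias matches the one used by the executor tests later in this commit.

package example_test

import (
	"testing"
	"time"

	assertext "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/assert"
)

func TestWorkEventuallyFinishes(t *testing.T) {
	done := make(chan struct{})
	go func() {
		time.Sleep(200 * time.Millisecond) // simulated asynchronous work
		close(done)
	}()

	// Polls every 100ms until the channel is closed, failing the test
	// if that does not happen within 2 seconds.
	assertext.EventuallyTrue(t, 2*time.Second, func() bool {
		select {
		case <-done:
			return true
		default:
			return false
		}
	}, "asynchronous work never finished")
}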
View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package assert is a utility package containing reusable testing functionality
// extending github.com/stretchr/testify/assert
package assert

View File

@@ -0,0 +1,96 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package backoff
import (
"math/rand"
"sync"
"time"
log "github.com/golang/glog"
)
type clock interface {
Now() time.Time
}
type realClock struct{}
func (realClock) Now() time.Time {
return time.Now()
}
type backoffEntry struct {
backoff time.Duration
lastUpdate time.Time
}
type Backoff struct {
perItemBackoff map[string]*backoffEntry
lock sync.Mutex
clock clock
defaultDuration time.Duration
maxDuration time.Duration
}
func New(initial, max time.Duration) *Backoff {
return &Backoff{
perItemBackoff: map[string]*backoffEntry{},
clock: realClock{},
defaultDuration: initial,
maxDuration: max,
}
}
func (p *Backoff) getEntry(id string) *backoffEntry {
p.lock.Lock()
defer p.lock.Unlock()
entry, ok := p.perItemBackoff[id]
if !ok {
entry = &backoffEntry{backoff: p.defaultDuration}
p.perItemBackoff[id] = entry
}
entry.lastUpdate = p.clock.Now()
return entry
}
func (p *Backoff) Get(id string) time.Duration {
entry := p.getEntry(id)
duration := entry.backoff
entry.backoff *= 2
if entry.backoff > p.maxDuration {
entry.backoff = p.maxDuration
}
//TODO(jdef) parameterize use of jitter?
// add jitter, get better backoff distribution
duration = time.Duration(rand.Int63n(int64(duration)))
log.V(3).Infof("Backing off %v for pod %s", duration, id)
return duration
}
// GC garbage collects records that have aged past maxDuration. Backoff users are
// expected to invoke this periodically.
func (p *Backoff) GC() {
p.lock.Lock()
defer p.lock.Unlock()
now := p.clock.Now()
for id, entry := range p.perItemBackoff {
if now.Sub(entry.lastUpdate) > p.maxDuration {
delete(p.perItemBackoff, id)
}
}
}

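A minimal sketch of how a caller might drive this type; the key and durations below are illustrative, not values used by this commit.

package main

import (
	"fmt"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/backoff"
)

func main() {
	// Per-item backoff starting at 1s and capped at 32s.
	b := backoff.New(1*time.Second, 32*time.Second)

	for attempt := 0; attempt < 5; attempt++ {
		// Each Get doubles the stored backoff for "pod-foo" (up to the cap)
		// and returns a jittered duration to wait before the next retry.
		d := b.Get("pod-foo")
		fmt.Printf("attempt %d: backing off %v\n", attempt, d)
	}

	// Callers are expected to garbage collect aged entries periodically.
	b.GC()
}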
View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package backoff provides backoff functionality with a simple API.
// Originally copied from Kubernetes: plugin/pkg/scheduler/factory/factory.go
package backoff

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package election provides interfaces used for master election.
package election

View File

@@ -0,0 +1,185 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package election
import (
"fmt"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
"github.com/coreos/go-etcd/etcd"
"github.com/golang/glog"
)
// Master is used to announce the current elected master.
type Master string
// IsAnAPIObject is used solely so we can work with the watch package.
// TODO(k8s): Either fix watch so this isn't necessary, or make this a real API Object.
// TODO(k8s): when it becomes clear how this package will be used, move these declarations
// to the proper place.
func (Master) IsAnAPIObject() {}
// NewEtcdMasterElector returns an implementation of election.MasterElector backed by etcd.
func NewEtcdMasterElector(h tools.EtcdGetSet) MasterElector {
return &etcdMasterElector{etcd: h}
}
type empty struct{}
// internal implementation struct
type etcdMasterElector struct {
etcd tools.EtcdGetSet
done chan empty
events chan watch.Event
}
// Elect implements the election.MasterElector interface.
func (e *etcdMasterElector) Elect(path, id string) watch.Interface {
e.done = make(chan empty)
e.events = make(chan watch.Event)
go util.Forever(func() { e.run(path, id) }, time.Second*5)
return e
}
func (e *etcdMasterElector) run(path, id string) {
masters := make(chan string)
errors := make(chan error)
go e.master(path, id, 30, masters, errors, e.done) // TODO(jdef) extract constant
for {
select {
case m := <-masters:
e.events <- watch.Event{
Type: watch.Modified,
Object: Master(m),
}
case err := <-errors:
glog.Errorf("error in election: %v", err)
}
}
}
// ResultChan implements the watch.Interface interface.
func (e *etcdMasterElector) ResultChan() <-chan watch.Event {
return e.events
}
// extendMaster attempts to extend ownership of a master lock for TTL seconds.
// returns "", nil if extension failed
// returns id, nil if extension succeeded
// returns "", err if an error occurred
func (e *etcdMasterElector) extendMaster(path, id string, ttl uint64, res *etcd.Response) (string, error) {
// If it matches the passed in id, extend the lease by writing a new entry.
// Uses compare and swap, so that if we TTL out in the meantime, the write will fail.
// We don't handle the TTL delete w/o a write case here, it's handled in the next loop
// iteration.
_, err := e.etcd.CompareAndSwap(path, id, ttl, "", res.Node.ModifiedIndex)
if err != nil && !tools.IsEtcdTestFailed(err) {
return "", err
}
if err != nil && tools.IsEtcdTestFailed(err) {
return "", nil
}
return id, nil
}
// becomeMaster attempts to become the master for this lock.
// returns "", nil if the attempt failed
// returns id, nil if the attempt succeeded
// returns "", err if an error occurred
func (e *etcdMasterElector) becomeMaster(path, id string, ttl uint64) (string, error) {
_, err := e.etcd.Create(path, id, ttl)
if err != nil && !tools.IsEtcdNodeExist(err) {
// unexpected error
return "", err
}
if err != nil && tools.IsEtcdNodeExist(err) {
return "", nil
}
return id, nil
}
// handleMaster performs one loop of master locking.
// on success it returns <master>, nil
// on error it returns "", err
// in situations where you should try again due to concurrent state changes (e.g. another actor simultaneously acquiring the lock)
// it returns "", nil
func (e *etcdMasterElector) handleMaster(path, id string, ttl uint64) (string, error) {
res, err := e.etcd.Get(path, false, false)
// Unexpected error, bail out
if err != nil && !tools.IsEtcdNotFound(err) {
return "", err
}
// There is no master, try to become the master.
if err != nil && tools.IsEtcdNotFound(err) {
return e.becomeMaster(path, id, ttl)
}
// This should never happen.
if res.Node == nil {
return "", fmt.Errorf("unexpected response: %#v", res)
}
// We're not the master, just return the current value
if res.Node.Value != id {
return res.Node.Value, nil
}
// We are the master, try to extend our lease
return e.extendMaster(path, id, ttl, res)
}
// master provides a distributed master-election lock; it maintains the lock until failure, or until someone sends something on the done channel.
// The basic algorithm is:
// while !done
// Get the current master
// If there is no current master
// Try to become the master
// Otherwise
// If we are the master, extend the lease
// If the master is different than the last time through the loop, report the master
// Sleep 80% of TTL
func (e *etcdMasterElector) master(path, id string, ttl uint64, masters chan<- string, errors chan<- error, done <-chan empty) {
lastMaster := ""
for {
master, err := e.handleMaster(path, id, ttl)
if err != nil {
errors <- err
} else if len(master) == 0 {
continue
} else if master != lastMaster {
lastMaster = master
masters <- master
}
// TODO(k8s): Add Watch here, skip the polling for faster reactions
// If done is closed, break out.
select {
case <-done:
return
case <-time.After(time.Duration((ttl*8)/10) * time.Second):
}
}
}
// Stop implements the watch.Interface interface.
func (e *etcdMasterElector) Stop() {
close(e.done)
}

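A hedged usage sketch, mirroring the tests below: the etcd endpoint, lock path, and id are illustrative, the election import path is assumed to sit alongside the other contrib/mesos packages, and it is assumed that the go-etcd client satisfies tools.EtcdGetSet (as it does elsewhere in Kubernetes).

package main

import (
	"fmt"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election"
	"github.com/coreos/go-etcd/etcd"
)

func main() {
	// Illustrative local etcd endpoint.
	client := etcd.NewClient([]string{"http://127.0.0.1:4001"})

	elector := election.NewEtcdMasterElector(client)
	w := elector.Elect("/mesos/master-lock", "scheduler-1")
	defer w.Stop()

	// Each Modified event carries the currently elected Master.
	for event := range w.ResultChan() {
		fmt.Printf("current master: %v\n", event.Object.(election.Master))
	}
}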
View File

@@ -0,0 +1,98 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package election
import (
"testing"
"github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
"github.com/coreos/go-etcd/etcd"
)
func TestEtcdMasterOther(t *testing.T) {
path := "foo"
etcd := tools.NewFakeEtcdClient(t)
etcd.Set(path, "baz", 0)
master := NewEtcdMasterElector(etcd)
w := master.Elect(path, "bar")
result := <-w.ResultChan()
if result.Type != watch.Modified || result.Object.(Master) != "baz" {
t.Errorf("unexpected event: %#v", result)
}
w.Stop()
}
func TestEtcdMasterNoOther(t *testing.T) {
path := "foo"
e := tools.NewFakeEtcdClient(t)
e.TestIndex = true
e.Data["foo"] = tools.EtcdResponseWithError{
R: &etcd.Response{
Node: nil,
},
E: &etcd.EtcdError{
ErrorCode: tools.EtcdErrorCodeNotFound,
},
}
master := NewEtcdMasterElector(e)
w := master.Elect(path, "bar")
result := <-w.ResultChan()
if result.Type != watch.Modified || result.Object.(Master) != "bar" {
t.Errorf("unexpected event: %#v", result)
}
w.Stop()
}
func TestEtcdMasterNoOtherThenConflict(t *testing.T) {
path := "foo"
e := tools.NewFakeEtcdClient(t)
e.TestIndex = true
// Ok, so we set up a chain of responses from etcd:
// 1) Nothing there
// 2) conflict (someone else wrote)
// 3) new value (the data they wrote)
empty := tools.EtcdResponseWithError{
R: &etcd.Response{
Node: nil,
},
E: &etcd.EtcdError{
ErrorCode: tools.EtcdErrorCodeNotFound,
},
}
empty.N = &tools.EtcdResponseWithError{
R: &etcd.Response{},
E: &etcd.EtcdError{
ErrorCode: tools.EtcdErrorCodeNodeExist,
},
}
empty.N.N = &tools.EtcdResponseWithError{
R: &etcd.Response{
Node: &etcd.Node{
Value: "baz",
},
},
}
e.Data["foo"] = empty
master := NewEtcdMasterElector(e)
w := master.Elect(path, "bar")
result := <-w.ResultChan()
if result.Type != watch.Modified || result.Object.(Master) != "bar" {
t.Errorf("unexpected event: %#v", result)
}
w.Stop()
}

View File

@@ -0,0 +1,53 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package election
import (
"sync"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
)
// Fake allows for testing of anything consuming a MasterElector.
type Fake struct {
mux *watch.Broadcaster
currentMaster Master
lock sync.Mutex // Protect access of currentMaster
}
// NewFake makes a new fake MasterElector.
func NewFake() *Fake {
// 0 means block for clients.
return &Fake{mux: watch.NewBroadcaster(0, watch.WaitIfChannelFull)}
}
func (f *Fake) ChangeMaster(newMaster Master) {
f.lock.Lock()
defer f.lock.Unlock()
f.mux.Action(watch.Modified, newMaster)
f.currentMaster = newMaster
}
func (f *Fake) Elect(path, id string) watch.Interface {
f.lock.Lock()
defer f.lock.Unlock()
w := f.mux.Watch()
if f.currentMaster != "" {
f.mux.Action(watch.Modified, f.currentMaster)
}
return w
}

View File

@@ -0,0 +1,134 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package election
import (
"sync"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
"github.com/golang/glog"
)
// MasterElector is an interface for services that can elect masters.
// Important note: MasterElector implementations are not interoperable; all participants in an
// election must use the same underlying implementation of this interface for correct behavior.
type MasterElector interface {
// Elect makes the caller, represented by 'id', enter into a master election for the
// distributed lock defined by 'path'.
// The returned watch.Interface provides a stream of Master objects which
// contain the current master.
// Calling Stop on the returned interface relinquishes ownership (if currently possessed)
// and removes the caller from the election.
Elect(path, id string) watch.Interface
}
// Service represents anything that can start and stop on demand.
type Service interface {
Validate(desired, current Master)
Start()
Stop()
}
type notifier struct {
lock sync.Mutex
cond *sync.Cond
// desired is updated with every change, current is updated after
// Start()/Stop() finishes. 'cond' is used to signal that a change
// might be needed. This handles the case where mastership flops
// around without calling Start()/Stop() excessively.
desired, current Master
// for comparison, to see if we are master.
id Master
service Service
}
// Notify runs Elect() on m, and calls Start()/Stop() on s when the
// elected master starts/stops matching 'id'. Never returns.
func Notify(m MasterElector, path, id string, s Service, abort <-chan struct{}) {
n := &notifier{id: Master(id), service: s}
n.cond = sync.NewCond(&n.lock)
finished := runtime.After(func() {
runtime.Until(func() {
for {
w := m.Elect(path, id)
eventLoop:
for {
select {
case <-abort:
return
case event, open := <-w.ResultChan():
if !open {
// the result channel closed; re-run the election
break eventLoop
}
if event.Type != watch.Modified {
continue
}
electedMaster, ok := event.Object.(Master)
if !ok {
glog.Errorf("Unexpected object from election channel: %v", event.Object)
continue
}
func() {
n.lock.Lock()
defer n.lock.Unlock()
n.desired = electedMaster
if n.desired != n.current {
n.cond.Signal()
}
}()
}
}
}
}, 0, abort)
})
runtime.Until(func() { n.serviceLoop(finished) }, 0, abort)
}
// serviceLoop waits for changes, and calls Start()/Stop() as needed.
func (n *notifier) serviceLoop(abort <-chan struct{}) {
n.lock.Lock()
defer n.lock.Unlock()
for {
select {
case <-abort:
return
default:
for n.desired == n.current {
ch := runtime.After(n.cond.Wait)
select {
case <-abort:
n.cond.Signal() // ensure that Wait() returns
<-ch
return
case <-ch:
// we were notified and have the lock, proceed..
}
}
if n.current != n.id && n.desired == n.id {
n.service.Validate(n.desired, n.current)
n.service.Start()
} else if n.current == n.id && n.desired != n.id {
n.service.Stop()
}
n.current = n.desired
}
}
}

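Before the test below, here is a minimal sketch of a Service implementation driven through Notify. The service type, lock path, and id are illustrative, and the Fake elector from the previous file stands in for a real one.

package main

import (
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election"
	"github.com/golang/glog"
)

// loggingService just logs leadership transitions (illustrative only).
type loggingService struct{}

func (s *loggingService) Validate(desired, current election.Master) {
	glog.Infof("leadership moving from %q to %q", current, desired)
}
func (s *loggingService) Start() { glog.Info("won the election: starting") }
func (s *loggingService) Stop()  { glog.Info("lost the election: stopping") }

func main() {
	elector := election.NewFake()
	abort := make(chan struct{})

	// Simulate this process winning the election.
	go elector.ChangeMaster(election.Master("scheduler-1"))

	// Notify blocks, calling Start()/Stop() on the service as "scheduler-1"
	// gains or loses mastership on the given path.
	election.Notify(elector, "/mesos/master-lock", "scheduler-1", &loggingService{}, abort)
}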
View File

@@ -0,0 +1,98 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package election
import (
"testing"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
)
type slowService struct {
t *testing.T
on bool
// We explicitly have no lock to prove that
// Start and Stop are not called concurrently.
changes chan<- bool
done <-chan struct{}
}
func (s *slowService) Validate(d, c Master) {
// noop
}
func (s *slowService) Start() {
select {
case <-s.done:
return // avoid writing to closed changes chan
default:
}
if s.on {
s.t.Errorf("Start() called on a service that is already on")
}
time.Sleep(2 * time.Millisecond)
s.on = true
s.changes <- true
}
func (s *slowService) Stop() {
select {
case <-s.done:
return // avoid writing to closed changes chan
default:
}
if !s.on {
s.t.Errorf("Stop() called on a service that is already off")
}
time.Sleep(2 * time.Millisecond)
s.on = false
s.changes <- false
}
func Test(t *testing.T) {
m := NewFake()
changes := make(chan bool, 1500)
done := make(chan struct{})
s := &slowService{t: t, changes: changes, done: done}
notifyDone := runtime.After(func() { Notify(m, "", "me", s, done) })
go func() {
defer close(done)
for i := 0; i < 500; i++ {
for _, key := range []string{"me", "notme", "alsonotme"} {
m.ChangeMaster(Master(key))
}
}
}()
<-notifyDone
close(changes)
changeList := []bool{}
for {
change, ok := <-changes
if !ok {
break
}
changeList = append(changeList, change)
}
if len(changeList) > 1000 {
t.Errorf("unexpected number of changes: %v", len(changeList))
}
}

View File

@@ -0,0 +1,29 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
import (
"time"
)
// default values to use when constructing mesos ExecutorInfo messages
const (
DefaultInfoID = "k8sm-executor"
DefaultInfoSource = "kubernetes"
DefaultInfoName = "Kubelet-Executor"
DefaultSuicideTimeout = 20 * time.Minute
)

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package config contains executor configuration constants.
package config

View File

@@ -0,0 +1,21 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Package executor includes a Mesos executor, which embeds
a kubelet to manage containers.
*/
package executor

View File

@@ -0,0 +1,846 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package executor
import (
"encoding/json"
"fmt"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
"github.com/fsouza/go-dockerclient"
"github.com/gogo/protobuf/proto"
log "github.com/golang/glog"
bindings "github.com/mesos/mesos-go/executor"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
)
const (
containerPollTime = 300 * time.Millisecond
launchGracePeriod = 5 * time.Minute
)
type stateType int32
const (
disconnectedState stateType = iota
connectedState
suicidalState
terminalState
)
func (s *stateType) get() stateType {
return stateType(atomic.LoadInt32((*int32)(s)))
}
func (s *stateType) transition(from, to stateType) bool {
return atomic.CompareAndSwapInt32((*int32)(s), int32(from), int32(to))
}
func (s *stateType) transitionTo(to stateType, unless ...stateType) bool {
if len(unless) == 0 {
atomic.StoreInt32((*int32)(s), int32(to))
return true
}
for {
state := s.get()
for _, x := range unless {
if state == x {
return false
}
}
if s.transition(state, to) {
return true
}
}
}
type kuberTask struct {
mesosTaskInfo *mesos.TaskInfo
podName string
}
// func that attempts suicide
type jumper func(bindings.ExecutorDriver, <-chan struct{})
type suicideWatcher interface {
Next(time.Duration, bindings.ExecutorDriver, jumper) suicideWatcher
Reset(time.Duration) bool
Stop() bool
}
type podStatusFunc func() (*api.PodStatus, error)
// KubernetesExecutor is a Mesos executor that runs pods
// on a minion machine.
type KubernetesExecutor struct {
kl *kubelet.Kubelet // the kubelet instance.
updateChan chan<- interface{} // to send pod config updates to the kubelet
state stateType
tasks map[string]*kuberTask
pods map[string]*api.Pod
lock sync.RWMutex
sourcename string
client *client.Client
events <-chan watch.Event
done chan struct{} // signals shutdown
outgoing chan func() (mesos.Status, error) // outgoing queue to the mesos driver
dockerClient dockertools.DockerInterface
suicideWatch suicideWatcher
suicideTimeout time.Duration
shutdownAlert func() // invoked just prior to executor shutdown
kubeletFinished <-chan struct{} // signals that kubelet Run() died
initialRegistration sync.Once
exitFunc func(int)
podStatusFunc func(*kubelet.Kubelet, *api.Pod) (*api.PodStatus, error)
}
type Config struct {
Kubelet *kubelet.Kubelet
Updates chan<- interface{} // to send pod config updates to the kubelet
SourceName string
APIClient *client.Client
Watch watch.Interface
Docker dockertools.DockerInterface
ShutdownAlert func()
SuicideTimeout time.Duration
KubeletFinished <-chan struct{} // signals that kubelet Run() died
ExitFunc func(int)
PodStatusFunc func(*kubelet.Kubelet, *api.Pod) (*api.PodStatus, error)
}
func (k *KubernetesExecutor) isConnected() bool {
return connectedState == (&k.state).get()
}
// New creates a new kubernetes executor.
func New(config Config) *KubernetesExecutor {
k := &KubernetesExecutor{
kl: config.Kubelet,
updateChan: config.Updates,
state: disconnectedState,
tasks: make(map[string]*kuberTask),
pods: make(map[string]*api.Pod),
sourcename: config.SourceName,
client: config.APIClient,
done: make(chan struct{}),
outgoing: make(chan func() (mesos.Status, error), 1024),
dockerClient: config.Docker,
suicideTimeout: config.SuicideTimeout,
kubeletFinished: config.KubeletFinished,
suicideWatch: &suicideTimer{},
shutdownAlert: config.ShutdownAlert,
exitFunc: config.ExitFunc,
podStatusFunc: config.PodStatusFunc,
}
//TODO(jdef) do something real with these events..
if config.Watch != nil {
events := config.Watch.ResultChan()
if events != nil {
go func() {
for e := range events {
// e ~= watch.Event { ADDED, *api.Event }
log.V(1).Info(e)
}
}()
k.events = events
}
}
return k
}
func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
k.killKubeletContainers()
k.resetSuicideWatch(driver)
go k.sendLoop()
//TODO(jdef) monitor kubeletFinished and shutdown if it happens
}
func (k *KubernetesExecutor) Done() <-chan struct{} {
return k.done
}
func (k *KubernetesExecutor) isDone() bool {
select {
case <-k.done:
return true
default:
return false
}
}
// Registered is called when the executor is successfully registered with the slave.
func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,
executorInfo *mesos.ExecutorInfo, frameworkInfo *mesos.FrameworkInfo, slaveInfo *mesos.SlaveInfo) {
if k.isDone() {
return
}
log.Infof("Executor %v of framework %v registered with slave %v\n",
executorInfo, frameworkInfo, slaveInfo)
if !(&k.state).transition(disconnectedState, connectedState) {
log.Errorf("failed to register/transition to a connected state")
}
k.initialRegistration.Do(k.onInitialRegistration)
}
// Reregistered is called when the executor is successfully re-registered with the slave.
// This can happen when the slave fails over.
func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
if k.isDone() {
return
}
log.Infof("Reregistered with slave %v\n", slaveInfo)
if !(&k.state).transition(disconnectedState, connectedState) {
log.Errorf("failed to reregister/transition to a connected state")
}
k.initialRegistration.Do(k.onInitialRegistration)
}
func (k *KubernetesExecutor) onInitialRegistration() {
// emit an empty update to allow the mesos "source" to be marked as seen
k.updateChan <- kubelet.PodUpdate{
Pods: []*api.Pod{},
Op: kubelet.SET,
Source: k.sourcename,
}
}
// Disconnected is called when the executor is disconnected from the slave.
func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
if k.isDone() {
return
}
log.Infof("Slave is disconnected\n")
if !(&k.state).transition(connectedState, disconnectedState) {
log.Errorf("failed to disconnect/transition to a disconnected state")
}
}
// LaunchTask is called when the executor receives a request to launch a task.
// This happens when the k8sm scheduler has decided to schedule the pod
// (which corresponds to a Mesos Task) onto the node where this executor
// is running, but the binding is not recorded in the Kubernetes store yet.
// This function is invoked to tell the executor to record the binding in the
// Kubernetes store and start the pod via the Kubelet.
func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
if k.isDone() {
return
}
log.Infof("Launch task %v\n", taskInfo)
if !k.isConnected() {
log.Errorf("Ignore launch task because the executor is disconnected\n")
k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
messages.ExecutorUnregistered))
return
}
obj, err := api.Codec.Decode(taskInfo.GetData())
if err != nil {
log.Errorf("failed to extract yaml data from the taskInfo.data %v", err)
k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
messages.UnmarshalTaskDataFailure))
return
}
pod, ok := obj.(*api.Pod)
if !ok {
log.Errorf("expected *api.Pod instead of %T: %+v", obj, obj)
k.sendStatus(driver, newStatus(taskInfo.GetTaskId(), mesos.TaskState_TASK_FAILED,
messages.UnmarshalTaskDataFailure))
return
}
k.lock.Lock()
defer k.lock.Unlock()
taskId := taskInfo.GetTaskId().GetValue()
if _, found := k.tasks[taskId]; found {
log.Errorf("task already launched\n")
// Do not send back TASK_RUNNING here; this may be a duplicate message
// or a duplicate task id.
return
}
// remember this task so that:
// (a) we ignore future launches for it
// (b) we have a record of it so that we can kill it if needed
// (c) we're leaving podName == "" for now, indicates we don't need to delete containers
k.tasks[taskId] = &kuberTask{
mesosTaskInfo: taskInfo,
}
k.resetSuicideWatch(driver)
go k.launchTask(driver, taskId, pod)
}
// TODO(jdef) add metrics for this?
type suicideTimer struct {
timer *time.Timer
}
func (w *suicideTimer) Next(d time.Duration, driver bindings.ExecutorDriver, f jumper) suicideWatcher {
return &suicideTimer{
timer: time.AfterFunc(d, func() {
log.Warningf("Suicide timeout (%v) expired", d)
f(driver, nil)
}),
}
}
func (w *suicideTimer) Stop() (result bool) {
if w != nil && w.timer != nil {
log.Infoln("stopping suicide watch") //TODO(jdef) debug
result = w.timer.Stop()
}
return
}
// return true if the timer was successfully reset
func (w *suicideTimer) Reset(d time.Duration) bool {
if w != nil && w.timer != nil {
log.Infoln("resetting suicide watch") //TODO(jdef) debug
w.timer.Reset(d)
return true
}
return false
}
// determine whether we need to start a suicide countdown. if so, then start
// a timer that, upon expiration, causes this executor to commit suicide.
// this implementation runs asynchronously. callers that wish to wait for the
// reset to complete may wait for the returned signal chan to close.
func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
ch := make(chan struct{})
go func() {
defer close(ch)
k.lock.Lock()
defer k.lock.Unlock()
if k.suicideTimeout < 1 {
return
}
if k.suicideWatch != nil {
if len(k.tasks) > 0 {
k.suicideWatch.Stop()
return
}
if k.suicideWatch.Reset(k.suicideTimeout) {
// valid timer, reset was successful
return
}
}
//TODO(jdef) reduce verbosity here once we're convinced that suicide watch is working properly
log.Infof("resetting suicide watch timer for %v", k.suicideTimeout)
k.suicideWatch = k.suicideWatch.Next(k.suicideTimeout, driver, jumper(k.attemptSuicide))
}()
return ch
}
func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
k.lock.Lock()
defer k.lock.Unlock()
// this attempt may have been queued and since been aborted
select {
case <-abort:
//TODO(jdef) reduce verbosity once suicide watch is working properly
log.Infof("aborting suicide attempt since watch was cancelled")
return
default: // continue
}
// fail-safe, will abort kamikaze attempts if there are tasks
if len(k.tasks) > 0 {
ids := []string{}
for taskid := range k.tasks {
ids = append(ids, taskid)
}
log.Errorf("suicide attempt failed, there are still running tasks: %v", ids)
return
}
log.Infoln("Attempting suicide")
if (&k.state).transitionTo(suicidalState, suicidalState, terminalState) {
//TODO(jdef) let the scheduler know?
//TODO(jdef) is suicide more graceful than slave-demanded shutdown?
k.doShutdown(driver)
}
}
// async continuation of LaunchTask
func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {
//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/scheduler.go
binding := &api.Binding{
ObjectMeta: api.ObjectMeta{
Namespace: pod.Namespace,
Name: pod.Name,
Annotations: make(map[string]string),
},
Target: api.ObjectReference{
Kind: "Node",
Name: pod.Annotations[meta.BindingHostKey],
},
}
// forward the annotations that the scheduler wants to apply
for k, v := range pod.Annotations {
binding.Annotations[k] = v
}
deleteTask := func() {
k.lock.Lock()
defer k.lock.Unlock()
delete(k.tasks, taskId)
k.resetSuicideWatch(driver)
}
log.Infof("Binding '%v/%v' to '%v' with annotations %+v...", pod.Namespace, pod.Name, binding.Target.Name, binding.Annotations)
ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
// TODO(k8s): use Pods interface for binding once clusters are upgraded
// return b.Pods(binding.Namespace).Bind(binding)
err := k.client.Post().Namespace(api.NamespaceValue(ctx)).Resource("bindings").Body(binding).Do().Error()
if err != nil {
deleteTask()
k.sendStatus(driver, newStatus(mutil.NewTaskID(taskId), mesos.TaskState_TASK_FAILED,
messages.CreateBindingFailure))
return
}
podFullName := container.GetPodFullName(pod)
// allow a recently failed-over scheduler the chance to recover the task/pod binding:
// it may have failed and recovered before the apiserver is able to report the updated
// binding information. replays of this status event will signal to the scheduler that
// the apiserver should be up-to-date.
data, err := json.Marshal(api.PodStatusResult{
ObjectMeta: api.ObjectMeta{
Name: podFullName,
SelfLink: "/podstatusresult",
},
})
if err != nil {
deleteTask()
log.Errorf("failed to marshal pod status result: %v", err)
k.sendStatus(driver, newStatus(mutil.NewTaskID(taskId), mesos.TaskState_TASK_FAILED,
err.Error()))
return
}
k.lock.Lock()
defer k.lock.Unlock()
// Add the task.
task, found := k.tasks[taskId]
if !found {
log.V(1).Infof("task %v not found, probably killed: aborting launch, reporting lost", taskId)
k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
return
}
//TODO(jdef) check for duplicate pod name, if found send TASK_ERROR
// from here on, we need to delete containers associated with the task
// upon it going into a terminal state
task.podName = podFullName
k.pods[podFullName] = pod
// send the latest snapshot of the set of pods to the kubelet via the pod update channel
update := kubelet.PodUpdate{Op: kubelet.SET}
for _, p := range k.pods {
update.Pods = append(update.Pods, p)
}
k.updateChan <- update
statusUpdate := &mesos.TaskStatus{
TaskId: mutil.NewTaskID(taskId),
State: mesos.TaskState_TASK_STARTING.Enum(),
Message: proto.String(messages.CreateBindingSuccess),
Data: data,
}
k.sendStatus(driver, statusUpdate)
// Delay reporting 'task running' until container is up.
psf := podStatusFunc(func() (*api.PodStatus, error) {
return k.podStatusFunc(k.kl, pod)
})
go k._launchTask(driver, taskId, podFullName, psf)
}
func (k *KubernetesExecutor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
expired := make(chan struct{})
time.AfterFunc(launchGracePeriod, func() { close(expired) })
getMarshalledInfo := func() (data []byte, cancel bool) {
// potentially long call..
if podStatus, err := psf(); err == nil && podStatus != nil {
select {
case <-expired:
cancel = true
default:
k.lock.Lock()
defer k.lock.Unlock()
if _, found := k.tasks[taskId]; !found {
// don't bother with the pod status if the task is already gone
cancel = true
break
} else if podStatus.Phase != api.PodRunning {
// avoid sending back a running status before it's really running
break
}
log.V(2).Infof("Found pod status: '%v'", podStatus)
result := api.PodStatusResult{
ObjectMeta: api.ObjectMeta{
Name: podFullName,
SelfLink: "/podstatusresult",
},
Status: *podStatus,
}
if data, err = json.Marshal(result); err != nil {
log.Errorf("failed to marshal pod status result: %v", err)
}
}
}
return
}
waitForRunningPod:
for {
select {
case <-expired:
log.Warningf("Launch expired grace period of '%v'", launchGracePeriod)
break waitForRunningPod
case <-time.After(containerPollTime):
if data, cancel := getMarshalledInfo(); cancel {
break waitForRunningPod
} else if data == nil {
continue waitForRunningPod
} else {
k.lock.Lock()
defer k.lock.Unlock()
if _, found := k.tasks[taskId]; !found {
goto reportLost
}
statusUpdate := &mesos.TaskStatus{
TaskId: mutil.NewTaskID(taskId),
State: mesos.TaskState_TASK_RUNNING.Enum(),
Message: proto.String(fmt.Sprintf("pod-running:%s", podFullName)),
Data: data,
}
k.sendStatus(driver, statusUpdate)
// continue to monitor the health of the pod
go k.__launchTask(driver, taskId, podFullName, psf)
return
}
}
}
k.lock.Lock()
defer k.lock.Unlock()
reportLost:
k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
}
func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
// TODO(nnielsen): Monitor health of pod and report if lost.
// Should we also allow this to fail a couple of times before reporting lost?
// What if the docker daemon is restarting and we can't connect, but it's
// going to bring the pods back online as soon as it restarts?
knownPod := func() bool {
_, err := psf()
return err == nil
}
// Wait for the pod to go away and stop monitoring once it does
// TODO (jdefelice) replace with an /events watch?
for {
time.Sleep(containerPollTime)
if k.checkForLostPodTask(driver, taskId, knownPod) {
return
}
}
}
// Intended to be executed as part of the pod monitoring loop, this fn (ultimately) checks with Docker
// whether the pod is running. It will only return false if the task is still registered and the pod is
// registered in Docker. Otherwise it returns true. If there's still a task record on file, but no pod
// in Docker, then we'll also send a TASK_LOST event.
func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
// TODO (jdefelice) don't send false alarms for deleted pods (KILLED tasks)
k.lock.Lock()
defer k.lock.Unlock()
// TODO(jdef) we should really consider k.pods here, along with what docker is reporting, since the
// kubelet may constantly attempt to instantiate a pod as long as it's in the pod state that we're
// handing to it. otherwise, we're probably reporting a TASK_LOST prematurely. Should probably
// consult RestartPolicy to determine appropriate behavior. Should probably also gracefully handle
// docker daemon restarts.
if _, ok := k.tasks[taskId]; ok {
if isKnownPod() {
return false
} else {
log.Warningf("Detected lost pod, reporting lost task %v", taskId)
k.reportLostTask(driver, taskId, messages.ContainersDisappeared)
}
} else {
log.V(2).Infof("Task %v no longer registered, stop monitoring for lost pods", taskId)
}
return true
}
// KillTask is called when the executor receives a request to kill a task.
func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
if k.isDone() {
return
}
log.Infof("Kill task %v\n", taskId)
if !k.isConnected() {
//TODO(jdefelice) send TASK_LOST here?
log.Warningf("Ignore kill task because the executor is disconnected\n")
return
}
k.lock.Lock()
defer k.lock.Unlock()
k.removePodTask(driver, taskId.GetValue(), messages.TaskKilled, mesos.TaskState_TASK_KILLED)
}
// Reports a lost task to the slave and updates internal task and pod tracking state.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
k.removePodTask(driver, tid, reason, mesos.TaskState_TASK_LOST)
}
// deletes the pod and task associated with the task identified by tid and sends a task
// status update to mesos. also attempts to reset the suicide watch.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
task, ok := k.tasks[tid]
if !ok {
log.V(1).Infof("Failed to remove task, unknown task %v\n", tid)
return
}
delete(k.tasks, tid)
k.resetSuicideWatch(driver)
pid := task.podName
if _, found := k.pods[pid]; !found {
log.Warningf("Cannot remove unknown pod %v for task %v", pid, tid)
} else {
log.V(2).Infof("deleting pod %v for task %v", pid, tid)
delete(k.pods, pid)
// Send the pod updates to the channel.
update := kubelet.PodUpdate{Op: kubelet.SET}
for _, p := range k.pods {
update.Pods = append(update.Pods, p)
}
k.updateChan <- update
}
// TODO(jdef): ensure that the update propagates, perhaps return a signal chan?
k.sendStatus(driver, newStatus(mutil.NewTaskID(tid), state, reason))
}
// FrameworkMessage is called when the framework sends some message to the executor
func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
if k.isDone() {
return
}
if !k.isConnected() {
log.Warningf("Ignore framework message because the executor is disconnected\n")
return
}
log.Infof("Received message from framework: %v\n", message)
//TODO(jdef) master reported a lost task, reconcile this! @see scheduler.go:handleTaskLost
if strings.HasPrefix(message, "task-lost:") && len(message) > 10 {
taskId := message[10:]
if taskId != "" {
// clean up pod state
k.lock.Lock()
defer k.lock.Unlock()
k.reportLostTask(driver, taskId, messages.TaskLostAck)
}
}
switch message {
case messages.Kamikaze:
k.attemptSuicide(driver, nil)
}
}
// Shutdown is called when the executor receives a shutdown request.
func (k *KubernetesExecutor) Shutdown(driver bindings.ExecutorDriver) {
k.lock.Lock()
defer k.lock.Unlock()
k.doShutdown(driver)
}
// assumes that caller has obtained state lock
func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
defer func() {
if r := recover(); r != nil {
log.Errorf("exiting with unclean shutdown: %v", r)
}
if k.exitFunc != nil {
k.exitFunc(1)
}
}()
(&k.state).transitionTo(terminalState)
// signal to all listeners that this KubeletExecutor is done!
close(k.done)
if k.shutdownAlert != nil {
func() {
defer util.HandleCrash()
k.shutdownAlert()
}()
}
log.Infoln("Stopping executor driver")
_, err := driver.Stop()
if err != nil {
log.Warningf("failed to stop executor driver: %v", err)
}
log.Infoln("Shutdown the executor")
// according to docs, mesos will generate TASK_LOST updates for us
// if needed, so don't take extra time to do that here.
k.tasks = map[string]*kuberTask{}
select {
// the main Run() func may still be running... wait for it to finish: it will
// clear the pod configuration cleanly, telling k8s "there are no pods" and
// clean up resources (pods, volumes, etc).
case <-k.kubeletFinished:
//TODO(jdef) attempt to wait for events to propagate to API server?
// TODO(jdef) extract constant, should be smaller than whatever the
// slave graceful shutdown timeout period is.
case <-time.After(15 * time.Second):
log.Errorf("timed out waiting for kubelet Run() to die")
}
log.Infoln("exiting")
if k.exitFunc != nil {
k.exitFunc(0)
}
}
// Destroy existing k8s containers
func (k *KubernetesExecutor) killKubeletContainers() {
if containers, err := dockertools.GetKubeletDockerContainers(k.dockerClient, true); err == nil {
opts := docker.RemoveContainerOptions{
RemoveVolumes: true,
Force: true,
}
for _, container := range containers {
opts.ID = container.ID
log.V(2).Infof("Removing container: %v", opts.ID)
if err := k.dockerClient.RemoveContainer(opts); err != nil {
log.Warning(err)
}
}
} else {
log.Warningf("Failed to list kubelet docker containers: %v", err)
}
}
// Error is called when some error happens.
func (k *KubernetesExecutor) Error(driver bindings.ExecutorDriver, message string) {
log.Errorln(message)
}
func newStatus(taskId *mesos.TaskID, state mesos.TaskState, message string) *mesos.TaskStatus {
return &mesos.TaskStatus{
TaskId: taskId,
State: &state,
Message: proto.String(message),
}
}
func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
select {
case <-k.done:
default:
k.outgoing <- func() (mesos.Status, error) { return driver.SendStatusUpdate(status) }
}
}
func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
select {
case <-k.done:
default:
k.outgoing <- func() (mesos.Status, error) { return driver.SendFrameworkMessage(msg) }
}
}
func (k *KubernetesExecutor) sendLoop() {
defer log.V(1).Info("sender loop exiting")
for {
select {
case <-k.done:
return
default:
if !k.isConnected() {
select {
case <-k.done:
case <-time.After(1 * time.Second):
}
continue
}
sender, ok := <-k.outgoing
if !ok {
// programming error
panic("someone closed the outgoing channel")
}
if status, err := sender(); err == nil {
continue
} else {
log.Error(err)
if status == mesos.Status_DRIVER_ABORTED {
return
}
}
// attempt to re-queue the sender
select {
case <-k.done:
case k.outgoing <- sender:
}
}
}
}

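For orientation, a sketch of constructing the executor from the Config above. Only a subset of fields is shown; the literal values mirror the defaults in the executor config package, and the remaining dependencies (Kubelet, APIClient, Docker, Watch, ...) are wired up by the executor service package in practice.

package main

import (
	"os"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor"
)

func main() {
	updates := make(chan interface{}, 1) // pod config updates destined for the kubelet
	kubeletDone := make(chan struct{})   // closed once the kubelet's Run() returns

	exec := executor.New(executor.Config{
		Updates:         updates,
		SourceName:      "kubernetes",     // cf. config.DefaultInfoSource
		SuicideTimeout:  20 * time.Minute, // cf. config.DefaultSuicideTimeout
		KubeletFinished: kubeletDone,
		ExitFunc:        os.Exit,
	})

	// In the real binary the executor is handed to a mesos-go ExecutorDriver,
	// which then invokes Registered/LaunchTask/KillTask/etc. on it.
	_ = exec
}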
View File

@@ -0,0 +1,618 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package executor
import (
"fmt"
"net/http"
"net/http/httptest"
"reflect"
"sync"
"sync/atomic"
"testing"
"time"
assertext "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/assert"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
kmruntime "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/testapi"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
"github.com/golang/glog"
bindings "github.com/mesos/mesos-go/executor"
"github.com/mesos/mesos-go/mesosproto"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
)
type suicideTracker struct {
suicideWatcher
stops uint32
resets uint32
timers uint32
jumps *uint32
}
func (t *suicideTracker) Reset(d time.Duration) bool {
defer func() { t.resets++ }()
return t.suicideWatcher.Reset(d)
}
func (t *suicideTracker) Stop() bool {
defer func() { t.stops++ }()
return t.suicideWatcher.Stop()
}
func (t *suicideTracker) Next(d time.Duration, driver bindings.ExecutorDriver, f jumper) suicideWatcher {
tracker := &suicideTracker{
stops: t.stops,
resets: t.resets,
jumps: t.jumps,
timers: t.timers + 1,
}
jumper := tracker.makeJumper(f)
tracker.suicideWatcher = t.suicideWatcher.Next(d, driver, jumper)
return tracker
}
func (t *suicideTracker) makeJumper(_ jumper) jumper {
return jumper(func(driver bindings.ExecutorDriver, cancel <-chan struct{}) {
glog.Warningln("jumping?!")
if t.jumps != nil {
atomic.AddUint32(t.jumps, 1)
}
})
}
func TestSuicide_zeroTimeout(t *testing.T) {
defer glog.Flush()
k := New(Config{})
tracker := &suicideTracker{suicideWatcher: k.suicideWatch}
k.suicideWatch = tracker
ch := k.resetSuicideWatch(nil)
select {
case <-ch:
case <-time.After(2 * time.Second):
t.Fatalf("timeout waiting for reset of suicide watch")
}
if tracker.stops != 0 {
t.Fatalf("expected no stops since suicideWatchTimeout was never set")
}
if tracker.resets != 0 {
t.Fatalf("expected no resets since suicideWatchTimeout was never set")
}
if tracker.timers != 0 {
t.Fatalf("expected no timers since suicideWatchTimeout was never set")
}
}
func TestSuicide_WithTasks(t *testing.T) {
defer glog.Flush()
k := New(Config{
SuicideTimeout: 50 * time.Millisecond,
})
jumps := uint32(0)
tracker := &suicideTracker{suicideWatcher: k.suicideWatch, jumps: &jumps}
k.suicideWatch = tracker
k.tasks["foo"] = &kuberTask{} // prevent suicide attempts from succeeding
// call reset with a nil timer
glog.Infoln("resetting suicide watch with 1 task")
select {
case <-k.resetSuicideWatch(nil):
tracker = k.suicideWatch.(*suicideTracker)
if tracker.stops != 1 {
t.Fatalf("expected suicide attempt to Stop() since there are registered tasks")
}
if tracker.resets != 0 {
t.Fatalf("expected no resets since")
}
if tracker.timers != 0 {
t.Fatalf("expected no timers since")
}
case <-time.After(1 * time.Second):
t.Fatalf("initial suicide watch setup failed")
}
delete(k.tasks, "foo") // zero remaining tasks
k.suicideTimeout = 1500 * time.Millisecond
suicideStart := time.Now()
// reset the suicide watch, which should actually start a timer now
glog.Infoln("resetting suicide watch with 0 tasks")
select {
case <-k.resetSuicideWatch(nil):
tracker = k.suicideWatch.(*suicideTracker)
if tracker.stops != 1 {
t.Fatalf("did not expect suicide attempt to Stop() since there are no registered tasks")
}
if tracker.resets != 1 {
t.Fatalf("expected 1 resets instead of %d", tracker.resets)
}
if tracker.timers != 1 {
t.Fatalf("expected 1 timers instead of %d", tracker.timers)
}
case <-time.After(1 * time.Second):
t.Fatalf("2nd suicide watch setup failed")
}
k.lock.Lock()
k.tasks["foo"] = &kuberTask{} // prevent suicide attempts from succeeding
k.lock.Unlock()
// reset the suicide watch, which should stop the existing timer
glog.Infoln("resetting suicide watch with 1 task")
select {
case <-k.resetSuicideWatch(nil):
tracker = k.suicideWatch.(*suicideTracker)
if tracker.stops != 2 {
t.Fatalf("expected 2 stops instead of %d since there are registered tasks", tracker.stops)
}
if tracker.resets != 1 {
t.Fatalf("expected 1 resets instead of %d", tracker.resets)
}
if tracker.timers != 1 {
t.Fatalf("expected 1 timers instead of %d", tracker.timers)
}
case <-time.After(1 * time.Second):
t.Fatalf("3rd suicide watch setup failed")
}
k.lock.Lock()
delete(k.tasks, "foo") // allow suicide attempts to schedule
k.lock.Unlock()
// reset the suicide watch, which should reset a stopped timer
glog.Infoln("resetting suicide watch with 0 tasks")
select {
case <-k.resetSuicideWatch(nil):
tracker = k.suicideWatch.(*suicideTracker)
if tracker.stops != 2 {
t.Fatalf("expected 2 stops instead of %d since there are no registered tasks", tracker.stops)
}
if tracker.resets != 2 {
t.Fatalf("expected 2 resets instead of %d", tracker.resets)
}
if tracker.timers != 1 {
t.Fatalf("expected 1 timers instead of %d", tracker.timers)
}
case <-time.After(1 * time.Second):
t.Fatalf("4th suicide watch setup failed")
}
sinceWatch := time.Since(suicideStart)
time.Sleep(3*time.Second - sinceWatch) // give the first timer a chance to misfire (it shouldn't, since Stop() was called)
if j := atomic.LoadUint32(&jumps); j != 1 {
t.Fatalf("expected 1 jumps instead of %d since stop was called", j)
} else {
glog.Infoln("jumps verified") // glog so we get a timestamp
}
}
// TestExecutorRegister ensures that the executor thinks it is connected
// after Register is called.
func TestExecutorRegister(t *testing.T) {
mockDriver := &MockExecutorDriver{}
updates := make(chan interface{}, 1024)
executor := New(Config{
Docker: dockertools.ConnectToDockerOrDie("fake://"),
Updates: updates,
SourceName: "executor_test",
})
executor.Init(mockDriver)
executor.Registered(mockDriver, nil, nil, nil)
initialPodUpdate := kubelet.PodUpdate{
Pods: []*api.Pod{},
Op: kubelet.SET,
Source: executor.sourcename,
}
receivedInitialPodUpdate := false
select {
case m := <-updates:
update, ok := m.(kubelet.PodUpdate)
if ok {
if reflect.DeepEqual(initialPodUpdate, update) {
receivedInitialPodUpdate = true
}
}
case <-time.After(time.Second):
}
assert.Equal(t, true, receivedInitialPodUpdate,
"executor should have sent an initial PodUpdate "+
"to the updates chan upon registration")
assert.Equal(t, true, executor.isConnected(), "executor should be connected")
mockDriver.AssertExpectations(t)
}
// TestExecutorDisconnect ensures that the executor thinks that it is not
// connected after a call to Disconnected has occurred.
func TestExecutorDisconnect(t *testing.T) {
mockDriver := &MockExecutorDriver{}
executor := NewTestKubernetesExecutor()
executor.Init(mockDriver)
executor.Registered(mockDriver, nil, nil, nil)
executor.Disconnected(mockDriver)
assert.Equal(t, false, executor.isConnected(),
"executor should not be connected after Disconnected")
mockDriver.AssertExpectations(t)
}
// TestExecutorReregister ensures that the executor thinks it is connected
// after a connection problem happens, followed by a call to Reregistered.
func TestExecutorReregister(t *testing.T) {
mockDriver := &MockExecutorDriver{}
executor := NewTestKubernetesExecutor()
executor.Init(mockDriver)
executor.Registered(mockDriver, nil, nil, nil)
executor.Disconnected(mockDriver)
executor.Reregistered(mockDriver, nil)
assert.Equal(t, true, executor.isConnected(), "executor should be connected")
mockDriver.AssertExpectations(t)
}
// TestExecutorLaunchAndKillTask ensures that the executor is able to launch
// and kill tasks while properly bookkeeping its tasks.
func TestExecutorLaunchAndKillTask(t *testing.T) {
// create a fake pod list/watch that backs the fake apiserver below
podListWatch := NewMockPodsListWatch(api.PodList{})
// create fake apiserver
testApiServer := NewTestServer(t, api.NamespaceDefault, &podListWatch.list)
defer testApiServer.server.Close()
mockDriver := &MockExecutorDriver{}
updates := make(chan interface{}, 1024)
config := Config{
Docker: dockertools.ConnectToDockerOrDie("fake://"),
Updates: updates,
APIClient: client.NewOrDie(&client.Config{
Host: testApiServer.server.URL,
Version: testapi.Version(),
}),
Kubelet: &kubelet.Kubelet{},
PodStatusFunc: func(kl *kubelet.Kubelet, pod *api.Pod) (*api.PodStatus, error) {
return &api.PodStatus{
ContainerStatuses: []api.ContainerStatus{
{
Name: "foo",
State: api.ContainerState{
Running: &api.ContainerStateRunning{},
},
},
},
Phase: api.PodRunning,
}, nil
},
}
executor := New(config)
executor.Init(mockDriver)
executor.Registered(mockDriver, nil, nil, nil)
select {
case <-updates:
case <-time.After(time.Second):
t.Fatalf("Executor should send an intial update on Registration")
}
pod := NewTestPod(1)
podTask, err := podtask.New(api.NewDefaultContext(), "",
*pod, &mesosproto.ExecutorInfo{})
assert.Equal(t, nil, err, "must be able to create a task from a pod")
taskInfo := podTask.BuildTaskInfo()
data, err := testapi.Codec().Encode(pod)
assert.Equal(t, nil, err, "must be able to encode a pod's spec data")
taskInfo.Data = data
var statusUpdateCalls sync.WaitGroup
statusUpdateDone := func(_ mock.Arguments) { statusUpdateCalls.Done() }
statusUpdateCalls.Add(1)
mockDriver.On(
"SendStatusUpdate",
mesosproto.TaskState_TASK_STARTING,
).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()
statusUpdateCalls.Add(1)
mockDriver.On(
"SendStatusUpdate",
mesosproto.TaskState_TASK_RUNNING,
).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()
executor.LaunchTask(mockDriver, taskInfo)
assertext.EventuallyTrue(t, 5*time.Second, func() bool {
executor.lock.Lock()
defer executor.lock.Unlock()
return len(executor.tasks) == 1 && len(executor.pods) == 1
}, "executor must be able to create a task and a pod")
gotPodUpdate := false
select {
case m := <-updates:
update, ok := m.(kubelet.PodUpdate)
if ok && len(update.Pods) == 1 {
gotPodUpdate = true
}
case <-time.After(time.Second):
}
assert.Equal(t, true, gotPodUpdate,
"the executor should send an update about a new pod to "+
"the updates chan when creating a new one.")
// Allow some time for asynchronous requests to the driver.
finished := kmruntime.After(statusUpdateCalls.Wait)
select {
case <-finished:
case <-time.After(5 * time.Second):
t.Fatalf("timed out waiting for status update calls to finish")
}
statusUpdateCalls.Add(1)
mockDriver.On(
"SendStatusUpdate",
mesosproto.TaskState_TASK_KILLED,
).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(statusUpdateDone).Once()
executor.KillTask(mockDriver, taskInfo.TaskId)
assertext.EventuallyTrue(t, 5*time.Second, func() bool {
executor.lock.Lock()
defer executor.lock.Unlock()
return len(executor.tasks) == 0 && len(executor.pods) == 0
}, "executor must be able to kill a created task and pod")
// Allow some time for asynchronous requests to the driver.
finished = kmruntime.After(statusUpdateCalls.Wait)
select {
case <-finished:
case <-time.After(5 * time.Second):
t.Fatalf("timed out waiting for status update calls to finish")
}
mockDriver.AssertExpectations(t)
}
// TestExecutorFrameworkMessage ensures that the executor is able to
// handle messages from the framework, specifically about lost tasks
// and Kamikaze. When a task is lost, the executor needs to clean up
// its state. When a Kamikaze message is received, the executor should
// attempt suicide.
func TestExecutorFrameworkMessage(t *testing.T) {
mockDriver := &MockExecutorDriver{}
kubeletFinished := make(chan struct{})
config := Config{
Docker: dockertools.ConnectToDockerOrDie("fake://"),
Updates: make(chan interface{}, 1024),
APIClient: client.NewOrDie(&client.Config{
Host: "fakehost",
Version: testapi.Version(),
}),
ShutdownAlert: func() {
close(kubeletFinished)
},
KubeletFinished: kubeletFinished,
}
executor := New(config)
executor.Init(mockDriver)
executor.Registered(mockDriver, nil, nil, nil)
executor.FrameworkMessage(mockDriver, "test framework message")
// set up a pod to then lose
pod := NewTestPod(1)
podTask, _ := podtask.New(api.NewDefaultContext(), "foo",
*pod, &mesosproto.ExecutorInfo{})
taskInfo := podTask.BuildTaskInfo()
data, _ := testapi.Codec().Encode(pod)
taskInfo.Data = data
executor.LaunchTask(mockDriver, taskInfo)
// send task-lost message for it
called := make(chan struct{})
mockDriver.On(
"SendStatusUpdate",
mesosproto.TaskState_TASK_LOST,
).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(func(_ mock.Arguments) { close(called) }).Once()
executor.FrameworkMessage(mockDriver, "task-lost:foo")
assertext.EventuallyTrue(t, 5*time.Second, func() bool {
executor.lock.Lock()
defer executor.lock.Unlock()
return len(executor.tasks) == 0 && len(executor.pods) == 0
}, "executor must be able to kill a created task and pod")
select {
case <-called:
case <-time.After(5 * time.Second):
t.Fatalf("timed out waiting for SendStatusUpdate")
}
mockDriver.On("Stop").Return(mesosproto.Status_DRIVER_STOPPED, nil).Once()
executor.FrameworkMessage(mockDriver, messages.Kamikaze)
assert.Equal(t, true, executor.isDone(),
"executor should have shut down after receiving a Kamikaze message")
mockDriver.AssertExpectations(t)
}
// Create a pod with a given index, requiring one port
func NewTestPod(i int) *api.Pod {
name := fmt.Sprintf("pod%d", i)
return &api.Pod{
TypeMeta: api.TypeMeta{APIVersion: testapi.Version()},
ObjectMeta: api.ObjectMeta{
Name: name,
Namespace: api.NamespaceDefault,
SelfLink: testapi.SelfLink("pods", name), // string(i) would produce a rune, not the pod name
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Ports: []api.ContainerPort{
{
ContainerPort: 8000 + i,
Protocol: api.ProtocolTCP,
},
},
},
},
},
Status: api.PodStatus{
Conditions: []api.PodCondition{
{
Type: api.PodReady,
Status: api.ConditionTrue,
},
},
},
}
}
// Create a mock of the pods ListWatch that would normally watch the apiserver pods endpoint
type MockPodsListWatch struct {
ListWatch cache.ListWatch
fakeWatcher *watch.FakeWatcher
list api.PodList
}
// An apiserver mock which partially mocks the pods API
type TestServer struct {
server *httptest.Server
Stats map[string]uint
lock sync.Mutex
}
func NewTestServer(t *testing.T, namespace string, pods *api.PodList) *TestServer {
ts := TestServer{
Stats: map[string]uint{},
}
mux := http.NewServeMux()
mux.HandleFunc(testapi.ResourcePath("bindings", namespace, ""), func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
})
ts.server = httptest.NewServer(mux)
return &ts
}
func NewMockPodsListWatch(initialPodList api.PodList) *MockPodsListWatch {
lw := MockPodsListWatch{
fakeWatcher: watch.NewFake(),
list: initialPodList,
}
lw.ListWatch = cache.ListWatch{
WatchFunc: func(resourceVersion string) (watch.Interface, error) {
return lw.fakeWatcher, nil
},
ListFunc: func() (runtime.Object, error) {
return &lw.list, nil
},
}
return &lw
}
// TestExecutorShutdown ensures that the executor properly shuts down
// when Shutdown is called.
func TestExecutorShutdown(t *testing.T) {
mockDriver := &MockExecutorDriver{}
kubeletFinished := make(chan struct{})
var exitCalled int32 = 0
config := Config{
Docker: dockertools.ConnectToDockerOrDie("fake://"),
Updates: make(chan interface{}, 1024),
ShutdownAlert: func() {
close(kubeletFinished)
},
KubeletFinished: kubeletFinished,
ExitFunc: func(_ int) {
atomic.AddInt32(&exitCalled, 1)
},
}
executor := New(config)
executor.Init(mockDriver)
executor.Registered(mockDriver, nil, nil, nil)
mockDriver.On("Stop").Return(mesosproto.Status_DRIVER_STOPPED, nil).Once()
executor.Shutdown(mockDriver)
assert.Equal(t, false, executor.isConnected(),
"executor should not be connected after Shutdown")
assert.Equal(t, true, executor.isDone(),
"executor should be in Done state after Shutdown")
select {
case <-executor.Done():
default:
t.Fatal("done channel should be closed after shutdown")
}
assert.Equal(t, true, atomic.LoadInt32(&exitCalled) > 0,
"the executor should call its ExitFunc when it is ready to close down")
mockDriver.AssertExpectations(t)
}
func TestExecutorsendFrameworkMessage(t *testing.T) {
mockDriver := &MockExecutorDriver{}
executor := NewTestKubernetesExecutor()
executor.Init(mockDriver)
executor.Registered(mockDriver, nil, nil, nil)
called := make(chan struct{})
mockDriver.On(
"SendFrameworkMessage",
"foo bar baz",
).Return(mesosproto.Status_DRIVER_RUNNING, nil).Run(func(_ mock.Arguments) { close(called) }).Once()
executor.sendFrameworkMessage(mockDriver, "foo bar baz")
// guard against data race in mock driver between AssertExpectations and Called
select {
case <-called: // expected
case <-time.After(5 * time.Second):
t.Fatalf("expected call to SendFrameworkMessage")
}
mockDriver.AssertExpectations(t)
}

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package messages exposes executor event/message names as constants.
package messages

View File

@@ -0,0 +1,32 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package messages
// messages that ship with TaskStatus objects
const (
ContainersDisappeared = "containers-disappeared"
CreateBindingFailure = "create-binding-failure"
CreateBindingSuccess = "create-binding-success"
ExecutorUnregistered = "executor-unregistered"
ExecutorShutdown = "executor-shutdown"
LaunchTaskFailed = "launch-task-failed"
TaskKilled = "task-killed"
UnmarshalTaskDataFailure = "unmarshal-task-data-failure"
TaskLostAck = "task-lost-ack" // executor acknowledgement of forwarded TASK_LOST framework message
Kamikaze = "kamikaze"
)
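These constants travel in the Message field of Mesos task status updates, as the executor's newStatus helper earlier in this commit shows. Below is an illustrative sketch (not part of this commit) of building such a status with mesos-go; stringPtr stands in for proto.String to keep the sketch dependency-light, and the task id value is a placeholder.

package main

import (
	"fmt"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
	mesos "github.com/mesos/mesos-go/mesosproto"
)

// stringPtr stands in for proto.String so the sketch needs no extra imports.
func stringPtr(s string) *string { return &s }

// newKilledStatus mirrors the executor's newStatus helper shown earlier in
// this commit: the message constant rides along with the terminal task state.
func newKilledStatus(taskId *mesos.TaskID) *mesos.TaskStatus {
	state := mesos.TaskState_TASK_KILLED
	return &mesos.TaskStatus{
		TaskId:  taskId,
		State:   &state,
		Message: stringPtr(messages.TaskKilled),
	}
}

func main() {
	status := newKilledStatus(&mesos.TaskID{Value: stringPtr("pod.abc123")})
	fmt.Println(status.GetState(), status.GetMessage())
}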

View File

@@ -0,0 +1,81 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package executor
import (
"testing"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
"github.com/mesos/mesos-go/mesosproto"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
)
type MockExecutorDriver struct {
mock.Mock
}
func (m *MockExecutorDriver) Start() (mesosproto.Status, error) {
args := m.Called()
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func (m *MockExecutorDriver) Stop() (mesosproto.Status, error) {
args := m.Called()
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func (m *MockExecutorDriver) Abort() (mesosproto.Status, error) {
args := m.Called()
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func (m *MockExecutorDriver) Join() (mesosproto.Status, error) {
args := m.Called()
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func (m *MockExecutorDriver) Run() (mesosproto.Status, error) {
args := m.Called()
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func (m *MockExecutorDriver) SendStatusUpdate(taskStatus *mesosproto.TaskStatus) (mesosproto.Status, error) {
args := m.Called(*taskStatus.State)
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func (m *MockExecutorDriver) SendFrameworkMessage(msg string) (mesosproto.Status, error) {
args := m.Called(msg)
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func NewTestKubernetesExecutor() *KubernetesExecutor {
return New(Config{
Docker: dockertools.ConnectToDockerOrDie("fake://"),
Updates: make(chan interface{}, 1024),
})
}
func TestExecutorNew(t *testing.T) {
mockDriver := &MockExecutorDriver{}
executor := NewTestKubernetesExecutor()
executor.Init(mockDriver)
assert.Equal(t, executor.isDone(), false, "executor should not be in Done state on initialization")
assert.Equal(t, executor.isConnected(), false, "executor should not be connected on initialization")
}
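Condensed from the tests above, the following is an illustrative sketch (not part of this commit, reusing this file's imports, with a hypothetical test name): because the mock's SendStatusUpdate records m.Called(*taskStatus.State), expectations are keyed on the task state alone, which is why the earlier tests register them as mockDriver.On("SendStatusUpdate", <state>).

// TestMockDriverStateMatching is an illustrative sketch demonstrating how
// expectations on the mock driver are matched against the dereferenced task
// state rather than the full TaskStatus value.
func TestMockDriverStateMatching(t *testing.T) {
	mockDriver := &MockExecutorDriver{}
	mockDriver.On(
		"SendStatusUpdate",
		mesosproto.TaskState_TASK_RUNNING,
	).Return(mesosproto.Status_DRIVER_RUNNING, nil).Once()

	state := mesosproto.TaskState_TASK_RUNNING
	status, err := mockDriver.SendStatusUpdate(&mesosproto.TaskStatus{State: &state})
	assert.NoError(t, err)
	assert.Equal(t, mesosproto.Status_DRIVER_RUNNING, status)
	mockDriver.AssertExpectations(t)
}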

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package service contains the cmd/k8sm-executor glue code.
package service

View File

@@ -0,0 +1,600 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"bufio"
"fmt"
"io"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
"strconv"
"strings"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/cmd/kubelet/app"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/credentialprovider"
"github.com/GoogleCloudPlatform/kubernetes/pkg/healthz"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/cadvisor"
kconfig "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/config"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util/mount"
log "github.com/golang/glog"
"github.com/kardianos/osext"
bindings "github.com/mesos/mesos-go/executor"
"github.com/spf13/pflag"
)
const (
// if we don't use this source then the kubelet will treat these as static pods and create mirror pods for them.
// @see ConfigSourceAnnotationKey
MESOS_CFG_SOURCE = kubelet.ApiserverSource
)
type KubeletExecutorServer struct {
*app.KubeletServer
RunProxy bool
ProxyLogV int
ProxyExec string
ProxyLogfile string
ProxyBindall bool
SuicideTimeout time.Duration
ShutdownFD int
ShutdownFIFO string
}
func NewKubeletExecutorServer() *KubeletExecutorServer {
k := &KubeletExecutorServer{
KubeletServer: app.NewKubeletServer(),
RunProxy: true,
ProxyExec: "./kube-proxy",
ProxyLogfile: "./proxy-log",
SuicideTimeout: config.DefaultSuicideTimeout,
}
if pwd, err := os.Getwd(); err != nil {
log.Warningf("failed to determine current directory: %v", err)
} else {
k.RootDirectory = pwd // mesos sandbox dir
}
k.Address = util.IP(net.ParseIP(defaultBindingAddress()))
k.ShutdownFD = -1 // indicates unspecified FD
return k
}
func NewHyperKubeletExecutorServer() *KubeletExecutorServer {
s := NewKubeletExecutorServer()
// cache this for later use
binary, err := osext.Executable()
if err != nil {
log.Fatalf("failed to determine currently running executable: %v", err)
}
s.ProxyExec = binary
return s
}
func (s *KubeletExecutorServer) addCoreFlags(fs *pflag.FlagSet) {
s.KubeletServer.AddFlags(fs)
fs.BoolVar(&s.RunProxy, "run-proxy", s.RunProxy, "Maintain a running kube-proxy instance as a child proc of this kubelet-executor.")
fs.IntVar(&s.ProxyLogV, "proxy-logv", s.ProxyLogV, "Log verbosity of the child kube-proxy.")
fs.StringVar(&s.ProxyLogfile, "proxy-logfile", s.ProxyLogfile, "Path to the kube-proxy log file.")
fs.BoolVar(&s.ProxyBindall, "proxy-bindall", s.ProxyBindall, "When true will cause kube-proxy to bind to 0.0.0.0.")
fs.DurationVar(&s.SuicideTimeout, "suicide-timeout", s.SuicideTimeout, "Self-terminate after this period of inactivity. Zero disables suicide watch.")
fs.IntVar(&s.ShutdownFD, "shutdown-fd", s.ShutdownFD, "File descriptor used to signal shutdown to external watchers, requires shutdown-fifo flag")
fs.StringVar(&s.ShutdownFIFO, "shutdown-fifo", s.ShutdownFIFO, "FIFO used to signal shutdown to external watchers, requires shutdown-fd flag")
}
func (s *KubeletExecutorServer) AddStandaloneFlags(fs *pflag.FlagSet) {
s.addCoreFlags(fs)
fs.StringVar(&s.ProxyExec, "proxy-exec", s.ProxyExec, "Path to the kube-proxy executable.")
}
func (s *KubeletExecutorServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
s.addCoreFlags(fs)
}
// returns a Closer that should be closed to signal impending shutdown, but only if ShutdownFD
// and ShutdownFIFO were specified. if they are specified, then this func blocks until there's
// a reader on the FIFO stream.
func (s *KubeletExecutorServer) syncExternalShutdownWatcher() (io.Closer, error) {
if s.ShutdownFD == -1 || s.ShutdownFIFO == "" {
return nil, nil
}
// redirfd -w n fifo ... # (blocks until the fifo is read)
log.Infof("blocked, waiting for shutdown reader for FD %d FIFO at %s", s.ShutdownFD, s.ShutdownFIFO)
return redirfd.Write.Redirect(true, false, redirfd.FileDescriptor(s.ShutdownFD), s.ShutdownFIFO)
}
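// exampleShutdownWatcher is an illustrative sketch, not part of this commit
// (the function name is hypothetical; it reuses this file's imports). It plays
// the role of the external watcher on the other end of the FIFO: opening the
// FIFO for reading unblocks syncExternalShutdownWatcher above, and EOF on the
// stream later signals that the executor has begun shutting down.
func exampleShutdownWatcher(fifoPath string) error {
	f, err := os.Open(fifoPath) // blocks until the executor opens its write end
	if err != nil {
		return err
	}
	defer f.Close()
	buf := make([]byte, 1)
	for {
		if _, err := f.Read(buf); err == io.EOF {
			log.Infof("executor signalled shutdown via %s", fifoPath)
			return nil
		} else if err != nil {
			return err
		}
	}
}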
// Run runs the specified KubeletExecutorServer.
func (s *KubeletExecutorServer) Run(hks hyperkube.Interface, _ []string) error {
rand.Seed(time.Now().UTC().UnixNano())
if err := util.ApplyOomScoreAdj(0, s.OOMScoreAdj); err != nil {
log.Info(err)
}
var apiclient *client.Client
clientConfig, err := s.CreateAPIServerClientConfig()
if err == nil {
apiclient, err = client.New(clientConfig)
}
if err != nil {
// required for k8sm since we need to send api.Binding information
// back to the apiserver
log.Fatalf("No API client: %v", err)
}
log.Infof("Using root directory: %v", s.RootDirectory)
credentialprovider.SetPreferredDockercfgPath(s.RootDirectory)
shutdownCloser, err := s.syncExternalShutdownWatcher()
if err != nil {
return err
}
cadvisorInterface, err := cadvisor.New(s.CadvisorPort)
if err != nil {
return err
}
imageGCPolicy := kubelet.ImageGCPolicy{
HighThresholdPercent: s.ImageGCHighThresholdPercent,
LowThresholdPercent: s.ImageGCLowThresholdPercent,
}
diskSpacePolicy := kubelet.DiskSpacePolicy{
DockerFreeDiskMB: s.LowDiskSpaceThresholdMB,
RootFreeDiskMB: s.LowDiskSpaceThresholdMB,
}
//TODO(jdef) intentionally NOT initializing a cloud provider here since:
//(a) the kubelet doesn't actually use it
//(b) we don't need to create N-kubelet connections to zookeeper for no good reason
//cloud := cloudprovider.InitCloudProvider(s.CloudProvider, s.CloudConfigFile)
//log.Infof("Successfully initialized cloud provider: %q from the config file: %q\n", s.CloudProvider, s.CloudConfigFile)
hostNetworkSources, err := kubelet.GetValidatedSources(strings.Split(s.HostNetworkSources, ","))
if err != nil {
return err
}
tlsOptions, err := s.InitializeTLS()
if err != nil {
return err
}
mounter := mount.New()
if s.Containerized {
log.V(2).Info("Running kubelet in containerized mode (experimental)")
mounter = &mount.NsenterMounter{}
}
var dockerExecHandler dockertools.ExecHandler
switch s.DockerExecHandlerName {
case "native":
dockerExecHandler = &dockertools.NativeExecHandler{}
case "nsenter":
dockerExecHandler = &dockertools.NsenterExecHandler{}
default:
log.Warningf("Unknown Docker exec handler %q; defaulting to native", s.DockerExecHandlerName)
dockerExecHandler = &dockertools.NativeExecHandler{}
}
kcfg := app.KubeletConfig{
Address: s.Address,
AllowPrivileged: s.AllowPrivileged,
HostNetworkSources: hostNetworkSources,
HostnameOverride: s.HostnameOverride,
RootDirectory: s.RootDirectory,
// ConfigFile: ""
// ManifestURL: ""
// FileCheckFrequency
// HTTPCheckFrequency
PodInfraContainerImage: s.PodInfraContainerImage,
SyncFrequency: s.SyncFrequency,
RegistryPullQPS: s.RegistryPullQPS,
RegistryBurst: s.RegistryBurst,
MinimumGCAge: s.MinimumGCAge,
MaxPerPodContainerCount: s.MaxPerPodContainerCount,
MaxContainerCount: s.MaxContainerCount,
RegisterNode: s.RegisterNode,
ClusterDomain: s.ClusterDomain,
ClusterDNS: s.ClusterDNS,
Runonce: s.RunOnce,
Port: s.Port,
ReadOnlyPort: s.ReadOnlyPort,
CadvisorInterface: cadvisorInterface,
EnableServer: s.EnableServer,
EnableDebuggingHandlers: s.EnableDebuggingHandlers,
DockerClient: dockertools.ConnectToDockerOrDie(s.DockerEndpoint),
KubeClient: apiclient,
MasterServiceNamespace: s.MasterServiceNamespace,
VolumePlugins: app.ProbeVolumePlugins(),
NetworkPlugins: app.ProbeNetworkPlugins(),
NetworkPluginName: s.NetworkPluginName,
StreamingConnectionIdleTimeout: s.StreamingConnectionIdleTimeout,
TLSOptions: tlsOptions,
ImageGCPolicy: imageGCPolicy,
DiskSpacePolicy: diskSpacePolicy,
Cloud: nil, // TODO(jdef) Cloud, specifying null here because we don't want all kubelets polling mesos-master; need to account for this in the cloudprovider impl
NodeStatusUpdateFrequency: s.NodeStatusUpdateFrequency,
ResourceContainer: s.ResourceContainer,
CgroupRoot: s.CgroupRoot,
ContainerRuntime: s.ContainerRuntime,
Mounter: mounter,
DockerDaemonContainer: s.DockerDaemonContainer,
SystemContainer: s.SystemContainer,
ConfigureCBR0: s.ConfigureCBR0,
MaxPods: s.MaxPods,
DockerExecHandler: dockerExecHandler,
}
err = app.RunKubelet(&kcfg, app.KubeletBuilder(func(kc *app.KubeletConfig) (app.KubeletBootstrap, *kconfig.PodConfig, error) {
return s.createAndInitKubelet(kc, hks, clientConfig, shutdownCloser)
}))
if err != nil {
return err
}
if s.HealthzPort > 0 {
healthz.DefaultHealthz()
go util.Forever(func() {
err := http.ListenAndServe(net.JoinHostPort(s.HealthzBindAddress.String(), strconv.Itoa(s.HealthzPort)), nil)
if err != nil {
log.Errorf("Starting health server failed: %v", err)
}
}, 5*time.Second)
}
// block until executor is shut down or commits shutdown
select {}
}
func defaultBindingAddress() string {
libProcessIP := os.Getenv("LIBPROCESS_IP")
if libProcessIP == "" {
return "0.0.0.0"
} else {
return libProcessIP
}
}
func (ks *KubeletExecutorServer) createAndInitKubelet(
kc *app.KubeletConfig,
hks hyperkube.Interface,
clientConfig *client.Config,
shutdownCloser io.Closer,
) (app.KubeletBootstrap, *kconfig.PodConfig, error) {
// TODO(k8s): block until all sources have delivered at least one update to the channel, or break the sync loop
// up into "per source" synchronizations
// TODO(k8s): KubeletConfig.KubeClient should be a client interface, but client interface misses certain methods
// used by kubelet. Since NewMainKubelet expects a client interface, we need to make sure we are not passing
// a nil pointer to it when what we really want is a nil interface.
var kubeClient client.Interface
if kc.KubeClient == nil {
kubeClient = nil
} else {
kubeClient = kc.KubeClient
}
gcPolicy := kubelet.ContainerGCPolicy{
MinAge: kc.MinimumGCAge,
MaxPerPodContainer: kc.MaxPerPodContainerCount,
MaxContainers: kc.MaxContainerCount,
}
pc := kconfig.NewPodConfig(kconfig.PodConfigNotificationSnapshotAndUpdates, kc.Recorder)
updates := pc.Channel(MESOS_CFG_SOURCE)
klet, err := kubelet.NewMainKubelet(
kc.Hostname,
kc.DockerClient,
kubeClient,
kc.RootDirectory,
kc.PodInfraContainerImage,
kc.SyncFrequency,
float32(kc.RegistryPullQPS),
kc.RegistryBurst,
gcPolicy,
pc.SeenAllSources,
kc.RegisterNode,
kc.ClusterDomain,
net.IP(kc.ClusterDNS),
kc.MasterServiceNamespace,
kc.VolumePlugins,
kc.NetworkPlugins,
kc.NetworkPluginName,
kc.StreamingConnectionIdleTimeout,
kc.Recorder,
kc.CadvisorInterface,
kc.ImageGCPolicy,
kc.DiskSpacePolicy,
kc.Cloud,
kc.NodeStatusUpdateFrequency,
kc.ResourceContainer,
kc.OSInterface,
kc.CgroupRoot,
kc.ContainerRuntime,
kc.Mounter,
kc.DockerDaemonContainer,
kc.SystemContainer,
kc.ConfigureCBR0,
kc.MaxPods,
kc.DockerExecHandler,
)
if err != nil {
return nil, nil, err
}
//TODO(jdef) either configure Watch here with something useful, or else
// get rid of it from executor.Config
kubeletFinished := make(chan struct{})
exec := executor.New(executor.Config{
Kubelet: klet,
Updates: updates,
SourceName: MESOS_CFG_SOURCE,
APIClient: kc.KubeClient,
Docker: kc.DockerClient,
SuicideTimeout: ks.SuicideTimeout,
KubeletFinished: kubeletFinished,
ShutdownAlert: func() {
if shutdownCloser != nil {
if e := shutdownCloser.Close(); e != nil {
log.Warningf("failed to signal shutdown to external watcher: %v", e)
}
}
},
ExitFunc: os.Exit,
PodStatusFunc: func(kl *kubelet.Kubelet, pod *api.Pod) (*api.PodStatus, error) {
return kl.GetRuntime().GetPodStatus(pod)
},
})
k := &kubeletExecutor{
Kubelet: klet,
runProxy: ks.RunProxy,
proxyLogV: ks.ProxyLogV,
proxyExec: ks.ProxyExec,
proxyLogfile: ks.ProxyLogfile,
proxyBindall: ks.ProxyBindall,
address: ks.Address,
dockerClient: kc.DockerClient,
hks: hks,
kubeletFinished: kubeletFinished,
executorDone: exec.Done(),
clientConfig: clientConfig,
}
dconfig := bindings.DriverConfig{
Executor: exec,
HostnameOverride: ks.HostnameOverride,
BindingAddress: net.IP(ks.Address),
}
if driver, err := bindings.NewMesosExecutorDriver(dconfig); err != nil {
log.Fatalf("failed to create executor driver: %v", err)
} else {
k.driver = driver
}
log.V(2).Infof("Initialize executor driver...")
k.BirthCry()
exec.Init(k.driver)
k.StartGarbageCollection()
return k, pc, nil
}
// kubelet decorator
type kubeletExecutor struct {
*kubelet.Kubelet
initialize sync.Once
driver bindings.ExecutorDriver
runProxy bool
proxyLogV int
proxyExec string
proxyLogfile string
proxyBindall bool
address util.IP
dockerClient dockertools.DockerInterface
hks hyperkube.Interface
kubeletFinished chan struct{} // closed once kubelet.Run() returns
executorDone <-chan struct{} // from KubeletExecutor.Done()
clientConfig *client.Config
}
func (kl *kubeletExecutor) ListenAndServe(address net.IP, port uint, tlsOptions *kubelet.TLSOptions, enableDebuggingHandlers bool) {
// this func could be called many times, depending how often the HTTP server crashes,
// so only execute certain initialization procs once
kl.initialize.Do(func() {
if kl.runProxy {
go runtime.Until(kl.runProxyService, 5*time.Second, kl.executorDone)
}
go func() {
if _, err := kl.driver.Run(); err != nil {
log.Fatalf("executor driver failed: %v", err)
}
log.Info("executor Run completed")
}()
})
log.Infof("Starting kubelet server...")
kubelet.ListenAndServeKubeletServer(kl, address, port, tlsOptions, enableDebuggingHandlers)
}
// this function blocks as long as the proxy service is running; intended to be
// executed asynchronously.
func (kl *kubeletExecutor) runProxyService() {
log.Infof("Starting proxy process...")
const KM_PROXY = "proxy" //TODO(jdef) constant should be shared with km package
args := []string{}
if kl.hks.FindServer(KM_PROXY) {
args = append(args, KM_PROXY)
log.V(1).Infof("attempting to using km proxy service")
} else if _, err := os.Stat(kl.proxyExec); os.IsNotExist(err) {
log.Errorf("failed to locate proxy executable at '%v' and km not present: %v", kl.proxyExec, err)
return
}
bindAddress := "0.0.0.0"
if !kl.proxyBindall {
bindAddress = kl.address.String()
}
args = append(args,
fmt.Sprintf("--bind-address=%s", bindAddress),
fmt.Sprintf("--v=%d", kl.proxyLogV),
"--logtostderr=true",
)
// add client.Config args here. proxy still calls client.BindClientConfigFlags
appendStringArg := func(name, value string) {
if value != "" {
args = append(args, fmt.Sprintf("--%s=%s", name, value))
}
}
appendStringArg("master", kl.clientConfig.Host)
/* TODO(jdef) move these flags to a config file pointed to by --kubeconfig
appendStringArg("api-version", kl.clientConfig.Version)
appendStringArg("client-certificate", kl.clientConfig.CertFile)
appendStringArg("client-key", kl.clientConfig.KeyFile)
appendStringArg("certificate-authority", kl.clientConfig.CAFile)
args = append(args, fmt.Sprintf("--insecure-skip-tls-verify=%t", kl.clientConfig.Insecure))
*/
log.Infof("Spawning process executable %s with args '%+v'", kl.proxyExec, args)
cmd := exec.Command(kl.proxyExec, args...)
if _, err := cmd.StdoutPipe(); err != nil {
log.Fatal(err)
}
proxylogs, err := cmd.StderrPipe()
if err != nil {
log.Fatal(err)
}
//TODO(jdef) append instead of truncate? what if the disk is full?
logfile, err := os.Create(kl.proxyLogfile)
if err != nil {
log.Fatal(err)
}
defer logfile.Close()
ch := make(chan struct{})
go func() {
defer func() {
select {
case <-ch:
log.Infof("killing proxy process..")
if err = cmd.Process.Kill(); err != nil {
log.Errorf("failed to kill proxy process: %v", err)
}
default:
}
}()
writer := bufio.NewWriter(logfile)
defer writer.Flush()
<-ch
written, err := io.Copy(writer, proxylogs)
if err != nil {
log.Errorf("error writing data to proxy log: %v", err)
}
log.Infof("wrote %d bytes to proxy log", written)
}()
// if the proxy fails to start then we exit the executor, otherwise
// wait for the proxy process to end (and release resources after).
if err := cmd.Start(); err != nil {
log.Fatal(err)
}
close(ch)
if err := cmd.Wait(); err != nil {
log.Error(err)
}
}
// runs the main kubelet loop, closing the kubeletFinished chan when the loop exits.
// never returns.
func (kl *kubeletExecutor) Run(updates <-chan kubelet.PodUpdate) {
defer func() {
close(kl.kubeletFinished)
util.HandleCrash()
log.Infoln("kubelet run terminated") //TODO(jdef) turn down verbosity
// important: never return! this is in our contract
select {}
}()
// push updates through a closable pipe. when the executor indicates shutdown
// via Done() we want to stop the Kubelet from processing updates.
pipe := make(chan kubelet.PodUpdate)
go func() {
// closing pipe will cause our patched kubelet's syncLoop() to exit
defer close(pipe)
pipeLoop:
for {
select {
case <-kl.executorDone:
break pipeLoop
default:
select {
case u := <-updates:
select {
case pipe <- u: // noop
case <-kl.executorDone:
break pipeLoop
}
case <-kl.executorDone:
break pipeLoop
}
}
}
}()
// we expect that Run() will complete after the pipe is closed and the
// kubelet's syncLoop() has finished processing its backlog, which hopefully
// will not take very long. Peeking into the future (current k8s master) it
// seems that the backlog has grown from 1 to 50 -- this may negatively impact
// us going forward, time will tell.
util.Until(func() { kl.Kubelet.Run(pipe) }, 0, kl.executorDone)
//TODO(jdef) revisit this if/when executor failover lands
err := kl.SyncPods([]*api.Pod{}, nil, nil, time.Now())
if err != nil {
log.Errorf("failed to cleanly remove all pods and associated state: %v", err)
}
}
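kubeletExecutor.Run above forwards pod updates through a closable pipe so that executor shutdown (executorDone) cleanly terminates the kubelet's sync loop. The following self-contained sketch isolates that forwarding pattern with hypothetical names (forwardUntilDone, string updates); it is illustrative and not part of this commit.

package main

import (
	"fmt"
	"time"
)

// forwardUntilDone forwards updates to a closable pipe until done is closed;
// closing the pipe is what tells the consumer to stop, mirroring the pattern
// in kubeletExecutor.Run above.
func forwardUntilDone(updates <-chan string, done <-chan struct{}) <-chan string {
	pipe := make(chan string)
	go func() {
		defer close(pipe) // closing the pipe ends the consumer's loop
		for {
			select {
			case <-done:
				return
			case u := <-updates:
				select {
				case pipe <- u:
				case <-done:
					return
				}
			}
		}
	}()
	return pipe
}

func main() {
	updates := make(chan string, 2)
	done := make(chan struct{})
	updates <- "pod update 1"
	updates <- "pod update 2"

	pipe := forwardUntilDone(updates, done)
	go func() {
		time.Sleep(50 * time.Millisecond)
		close(done)
	}()
	for u := range pipe { // ranges until the pipe is closed
		fmt.Println("consumed:", u)
	}
	fmt.Println("pipe closed, consumer exits")
}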

View File

@@ -0,0 +1,21 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package hyperkube facilitates the combination of multiple
// kubernetes-mesos components into a single binary form, providing a
// simple mechanism for intra-component discovery as per the original
// Kubernetes hyperkube package.
package hyperkube

View File

@@ -0,0 +1,54 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package hyperkube
import (
"github.com/spf13/pflag"
)
var (
nilKube = &nilKubeType{}
)
type Interface interface {
// FindServer will find a specific server named name.
FindServer(name string) bool
// The executable name, used for help and soft-link invocation
Name() string
// Flags returns a flagset for "global" flags.
Flags() *pflag.FlagSet
}
type nilKubeType struct{}
func (n *nilKubeType) FindServer(_ string) bool {
return false
}
func (n *nilKubeType) Name() string {
return ""
}
func (n *nilKubeType) Flags() *pflag.FlagSet {
return nil
}
func Nil() Interface {
return nilKube
}
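The nil implementation above exists so callers can always depend on a hyperkube.Interface, even outside a combined binary. Below is a small, self-contained sketch (hypothetical fakeHyperkube and proxyInvocation names; the Interface is mirrored locally for self-containment) of how runProxyService earlier in this commit uses FindServer to decide between the embedded proxy server and a standalone kube-proxy executable.

package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

// Interface mirrors hyperkube.Interface as defined above.
type Interface interface {
	FindServer(name string) bool
	Name() string
	Flags() *pflag.FlagSet
}

// fakeHyperkube is a hypothetical implementation used only for this sketch.
type fakeHyperkube struct {
	servers map[string]bool
}

func (f *fakeHyperkube) FindServer(name string) bool { return f.servers[name] }
func (f *fakeHyperkube) Name() string                { return "km" }
func (f *fakeHyperkube) Flags() *pflag.FlagSet       { return pflag.NewFlagSet("km", pflag.ContinueOnError) }

// proxyInvocation mimics the decision made in runProxyService above: when the
// hyperkube binary provides a "proxy" server, proxyExec is that binary and the
// "proxy" sub-command is appended; otherwise a standalone kube-proxy is used.
func proxyInvocation(hks Interface, proxyExec string) []string {
	if hks.FindServer("proxy") {
		return []string{proxyExec, "proxy"}
	}
	return []string{proxyExec}
}

func main() {
	withProxy := &fakeHyperkube{servers: map[string]bool{"proxy": true}}
	fmt.Println(proxyInvocation(withProxy, "./km"))
	fmt.Println(proxyInvocation(&fakeHyperkube{servers: map[string]bool{}}, "./kube-proxy"))
}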

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package offers contains code that manages Mesos offers.
package offers

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package metrics defines and exposes instrumentation metrics related to
// Mesos offers.
package metrics

View File

@@ -0,0 +1,89 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
)
const (
offerSubsystem = "mesos_offers"
)
type OfferDeclinedReason string
const (
OfferExpired = OfferDeclinedReason("expired")
OfferRescinded = OfferDeclinedReason("rescinded")
OfferCompat = OfferDeclinedReason("compat")
)
var (
OffersReceived = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: offerSubsystem,
Name: "received",
Help: "Counter of offers received from Mesos broken out by slave host.",
},
[]string{"hostname"},
)
OffersDeclined = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: offerSubsystem,
Name: "declined",
Help: "Counter of offers declined by the framework broken out by slave host.",
},
[]string{"hostname", "reason"},
)
OffersAcquired = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: offerSubsystem,
Name: "acquired",
Help: "Counter of offers acquired for task launch broken out by slave host.",
},
[]string{"hostname"},
)
OffersReleased = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: offerSubsystem,
Name: "released",
Help: "Counter of previously-acquired offers later released, broken out by slave host.",
},
[]string{"hostname"},
)
)
var registerMetrics sync.Once
func Register() {
registerMetrics.Do(func() {
prometheus.MustRegister(OffersReceived)
prometheus.MustRegister(OffersDeclined)
prometheus.MustRegister(OffersAcquired)
prometheus.MustRegister(OffersReleased)
})
}
func InMicroseconds(d time.Duration) float64 {
return float64(d.Nanoseconds() / time.Microsecond.Nanoseconds())
}
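A brief usage sketch, not part of this commit: register the collectors once, then bump them with the label values they declare, as offerStorage does later in this commit. The hostname shown is a placeholder.

package main

import (
	"fmt"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers/metrics"
)

func main() {
	metrics.Register() // safe to call repeatedly thanks to sync.Once

	// bump counters with the labels declared above
	metrics.OffersReceived.WithLabelValues("slave-1.example.com").Inc()
	metrics.OffersDeclined.WithLabelValues("slave-1.example.com", string(metrics.OfferCompat)).Inc()

	// InMicroseconds converts a duration into the float64 unit these
	// subsystems report in.
	fmt.Println(metrics.InMicroseconds(3 * time.Millisecond)) // 3000
}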

View File

@@ -0,0 +1,570 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package offers
import (
"fmt"
"reflect"
"sync"
"sync/atomic"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers/metrics"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
)
const (
offerListenerMaxAge = 12 // max number of times we'll attempt to fit an offer to a listener before requiring them to re-register themselves
offerIdCacheTTL = 1 * time.Second // determines expiration of cached offer ids, used in listener notification
deferredDeclineTtlFactor = 2 // this factor, multiplied by the offer ttl, determines how long to wait before attempting to decline previously claimed offers that were subsequently deleted, then released. see offerStorage.Delete
notifyListenersDelay = 0 // delay between offer listener notification attempts
)
type Filter func(*mesos.Offer) bool
type Registry interface {
// Initialize the instance, spawning necessary housekeeping go routines.
Init(<-chan struct{})
// Add offers to this registry, rejecting those that are deemed incompatible.
Add([]*mesos.Offer)
// Listen for arriving offers that are acceptable to the filter, sending
// a signal on (by closing) the returned channel. A listener will only
// ever be notified once, if at all.
Listen(id string, f Filter) <-chan struct{}
// invoked when offers are rescinded or expired
Delete(string, metrics.OfferDeclinedReason)
// Get returns the offer registered for the given ID; the bool reports whether it was found
Get(offerId string) (Perishable, bool)
// iterate through non-expired offers in this registry
Walk(Walker) error
// invalidate one or all (when offerId="") offers; offers are not declined,
// but are simply flagged as expired in the offer history
Invalidate(offerId string)
// invalidate all offers associated with the slave identified by slaveId.
InvalidateForSlave(slaveId string)
}
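// exampleRegistryUsage is an illustrative sketch, not part of the original
// commit (the function name and field values are placeholders; it reuses this
// file's imports). It wires up a Registry with a stub decline callback, feeds
// it offers, and walks the live ones.
func exampleRegistryUsage(offers []*mesos.Offer, done <-chan struct{}) {
	registry := CreateRegistry(RegistryConfig{
		DeclineOffer: func(offerId string) <-chan error {
			errOnce := make(chan error, 1)
			close(errOnce) // pretend the decline always succeeds
			return errOnce
		},
		Compat:        func(_ *mesos.Offer) bool { return true },
		TTL:           30 * time.Second,
		LingerTTL:     2 * time.Minute,
		ListenerDelay: 1 * time.Second,
	})
	registry.Init(done)
	registry.Add(offers)
	_ = registry.Walk(func(offer Perishable) (bool, error) {
		log.Infof("live offer %v on host %v", offer.Id(), offer.Host())
		return false, nil // keep walking
	})
}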
// callback that is invoked during a walk through a series of live offers,
// returning with stop=true (or err != nil) if the walk should stop prematurely.
type Walker func(offer Perishable) (stop bool, err error)
type RegistryConfig struct {
DeclineOffer func(offerId string) <-chan error // tell Mesos that we're declining the offer
Compat func(*mesos.Offer) bool // returns true if offer is compatible; incompatible offers are declined
TTL time.Duration // determines a perishable offer's expiration deadline: now+ttl
LingerTTL time.Duration // if zero, offers will not linger in the FIFO past their expiration deadline
ListenerDelay time.Duration // specifies the sleep time between offer listener notifications
}
type offerStorage struct {
RegistryConfig
offers *cache.FIFO // collection of Perishable, both live and expired
listeners *queue.DelayFIFO // collection of *offerListener
delayed *queue.DelayQueue // deadline-oriented offer-event queue
slaves *slaveStorage // slave to offer mappings
}
type liveOffer struct {
*mesos.Offer
expiration time.Time
acquired int32 // 1 = acquired, 0 = free
}
type expiredOffer struct {
offerSpec
deadline time.Time
}
// subset of the mesos offer data useful for recordkeeping
type offerSpec struct {
id string
hostname string
}
// offers that may perish (all of them?) implement this interface.
// callers may expect to access these funcs concurrently so implementations
// must provide their own form of synchronization around mutable state.
type Perishable interface {
// returns true if this offer has expired
HasExpired() bool
// if not yet expired, return mesos offer details; otherwise nil
Details() *mesos.Offer
// mark this offer as acquired, returning true if it was previously unacquired. thread-safe.
Acquire() bool
// mark this offer as un-acquired. thread-safe.
Release()
// expire or delete this offer from storage
age(s *offerStorage)
// return a unique identifier for this offer
Id() string
// return the slave host for this offer
Host() string
addTo(*queue.DelayQueue)
}
func (e *expiredOffer) addTo(q *queue.DelayQueue) {
q.Add(e)
}
func (e *expiredOffer) Id() string {
return e.id
}
func (e *expiredOffer) Host() string {
return e.hostname
}
func (e *expiredOffer) HasExpired() bool {
return true
}
func (e *expiredOffer) Details() *mesos.Offer {
return nil
}
func (e *expiredOffer) Acquire() bool {
return false
}
func (e *expiredOffer) Release() {}
func (e *expiredOffer) age(s *offerStorage) {
log.V(3).Infof("Delete lingering offer: %v", e.id)
s.offers.Delete(e)
s.slaves.deleteOffer(e.id)
}
// return the time left to linger
func (e *expiredOffer) GetDelay() time.Duration {
return e.deadline.Sub(time.Now())
}
func (to *liveOffer) HasExpired() bool {
return time.Now().After(to.expiration)
}
func (to *liveOffer) Details() *mesos.Offer {
return to.Offer
}
func (to *liveOffer) Acquire() (acquired bool) {
if acquired = atomic.CompareAndSwapInt32(&to.acquired, 0, 1); acquired {
metrics.OffersAcquired.WithLabelValues(to.Host()).Inc()
}
return
}
func (to *liveOffer) Release() {
if released := atomic.CompareAndSwapInt32(&to.acquired, 1, 0); released {
metrics.OffersReleased.WithLabelValues(to.Host()).Inc()
}
}
func (to *liveOffer) age(s *offerStorage) {
s.Delete(to.Id(), metrics.OfferExpired)
}
func (to *liveOffer) Id() string {
return to.Offer.Id.GetValue()
}
func (to *liveOffer) Host() string {
return to.Offer.GetHostname()
}
func (to *liveOffer) addTo(q *queue.DelayQueue) {
q.Add(to)
}
// return the time remaining before the offer expires
func (to *liveOffer) GetDelay() time.Duration {
return to.expiration.Sub(time.Now())
}
func CreateRegistry(c RegistryConfig) Registry {
metrics.Register()
return &offerStorage{
RegistryConfig: c,
offers: cache.NewFIFO(cache.KeyFunc(func(v interface{}) (string, error) {
if perishable, ok := v.(Perishable); !ok {
return "", fmt.Errorf("expected perishable offer, not '%+v'", v)
} else {
return perishable.Id(), nil
}
})),
listeners: queue.NewDelayFIFO(),
delayed: queue.NewDelayQueue(),
slaves: newSlaveStorage(),
}
}
func (s *offerStorage) declineOffer(offerId, hostname string, reason metrics.OfferDeclinedReason) {
//TODO(jdef) might be nice to spec an abort chan here
runtime.Signal(proc.OnError(s.DeclineOffer(offerId), func(err error) {
log.Warningf("decline failed for offer id %v: %v", offerId, err)
}, nil)).Then(func() {
metrics.OffersDeclined.WithLabelValues(hostname, string(reason)).Inc()
})
}
func (s *offerStorage) Add(offers []*mesos.Offer) {
now := time.Now()
for _, offer := range offers {
if !s.Compat(offer) {
//TODO(jdef) would be nice to batch these up
offerId := offer.Id.GetValue()
log.V(3).Infof("Declining incompatible offer %v", offerId)
s.declineOffer(offerId, offer.GetHostname(), metrics.OfferCompat)
continue // decline this offer, but keep processing the rest of the batch
}
timed := &liveOffer{
Offer: offer,
expiration: now.Add(s.TTL),
acquired: 0,
}
log.V(3).Infof("Receiving offer %v", timed.Id())
s.offers.Add(timed)
s.delayed.Add(timed)
s.slaves.add(offer.SlaveId.GetValue(), timed.Id())
metrics.OffersReceived.WithLabelValues(timed.Host()).Inc()
}
}
// delete an offer from storage, implicitly expires the offer
func (s *offerStorage) Delete(offerId string, reason metrics.OfferDeclinedReason) {
if offer, ok := s.Get(offerId); ok {
log.V(3).Infof("Deleting offer %v", offerId)
// attempt to block others from consuming the offer. if it's already been
// claimed and is not yet lingering then don't decline it - just mark it as
// expired in the history: allow a prior claimant to attempt to launch with it
notYetClaimed := offer.Acquire()
if offer.Details() != nil {
if notYetClaimed {
log.V(3).Infof("Declining offer %v", offerId)
s.declineOffer(offerId, offer.Host(), reason)
} else {
// some pod has acquired this and may attempt to launch a task with it
// failed schedule/launch attempts are required to Release() any claims on the offer
// TODO(jdef): not sure what a good value is here. the goal is to provide a
// launchTasks (driver) operation enough time to complete so that we don't end
// up declining an offer that we're actually attempting to use.
time.AfterFunc(deferredDeclineTtlFactor*s.TTL, func() {
// at this point the offer is in one of five states:
// a) permanently deleted: expired due to timeout
// b) permanently deleted: expired due to having been rescinded
// c) lingering: expired due to timeout
// d) lingering: expired due to having been rescinded
// e) claimed: a task was launched and is using resources from this offer
// we want to **avoid** declining an offer that's claimed: attempt to acquire
if offer.Acquire() {
// previously claimed offer was released, perhaps due to a launch
// failure, so we should attempt to decline
log.V(3).Infof("attempting to decline (previously claimed) offer %v", offerId)
s.declineOffer(offerId, offer.Host(), reason)
}
})
}
}
s.expireOffer(offer)
} // else, ignore offers not in the history
}
func (s *offerStorage) InvalidateForSlave(slaveId string) {
offerIds := s.slaves.deleteSlave(slaveId)
for oid := range offerIds {
s.invalidateOne(oid)
}
}
// if offerId == "" then expire all known, live offers, otherwise only the offer indicated
func (s *offerStorage) Invalidate(offerId string) {
if offerId != "" {
s.invalidateOne(offerId)
return
}
obj := s.offers.List()
for _, o := range obj {
offer, ok := o.(Perishable)
if !ok {
log.Errorf("Expected perishable offer, not %v", o)
continue
}
offer.Acquire() // attempt to block others from using it
s.expireOffer(offer)
// don't decline, we already know that it's an invalid offer
}
}
func (s *offerStorage) invalidateOne(offerId string) {
if offer, ok := s.Get(offerId); ok {
offer.Acquire() // attempt to block others from using it
s.expireOffer(offer)
// don't decline, we already know that it's an invalid offer
}
}
// Walk the collection of offers. The walk stops either as indicated by the
// Walker or when the end of the offer list is reached. Expired offers are
// never passed to a Walker.
func (s *offerStorage) Walk(w Walker) error {
for _, v := range s.offers.List() {
offer, ok := v.(Perishable)
if !ok {
// offer disappeared...
continue
}
if offer.HasExpired() {
// never pass expired offers to walkers
continue
}
if stop, err := w(offer); err != nil {
return err
} else if stop {
return nil
}
}
return nil
}
func Expired(offerId, hostname string, ttl time.Duration) *expiredOffer {
return &expiredOffer{offerSpec{id: offerId, hostname: hostname}, time.Now().Add(ttl)}
}
func (s *offerStorage) expireOffer(offer Perishable) {
// the offer may or may not be expired due to TTL so check for details
// since that's a more reliable determinant of lingering status
if details := offer.Details(); details != nil {
// recently expired, should linger
offerId := details.Id.GetValue()
log.V(3).Infof("Expiring offer %v", offerId)
if s.LingerTTL > 0 {
log.V(3).Infof("offer will linger: %v", offerId)
expired := Expired(offerId, offer.Host(), s.LingerTTL)
s.offers.Update(expired)
s.delayed.Add(expired)
} else {
log.V(3).Infof("Permanently deleting offer %v", offerId)
s.offers.Delete(offerId)
s.slaves.deleteOffer(offerId)
}
} // else, it's still lingering...
}
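// Get returns the offer (live or lingering) stored under the given id; ok is false if no such offer is known.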
func (s *offerStorage) Get(id string) (Perishable, bool) {
if obj, ok, _ := s.offers.GetByKey(id); !ok {
return nil, false
} else {
to, ok := obj.(Perishable)
if !ok {
log.Errorf("invalid offer object in fifo '%v'", obj)
}
return to, ok
}
}
type offerListener struct {
id string
accepts Filter
notify chan<- struct{}
age int
deadline time.Time
sawVersion uint64
}
func (l *offerListener) GetUID() string {
return l.id
}
func (l *offerListener) Deadline() (time.Time, bool) {
return l.deadline, true
}
// register a listener for new offers; the listener is notified when a matching offer arrives.
// notification is delivered by closing the returned channel; nothing is ever sent on it.
func (s *offerStorage) Listen(id string, f Filter) <-chan struct{} {
if f == nil {
return nil
}
ch := make(chan struct{})
listen := &offerListener{
id: id,
accepts: f,
notify: ch,
deadline: time.Now().Add(s.ListenerDelay),
}
log.V(3).Infof("Registering offer listener %s", listen.id)
s.listeners.Offer(listen, queue.ReplaceExisting)
return ch
}
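// ageOffers pops the next item from the delay queue: live offers that have not yet expired are re-queued, everything else is aged into the lingering or deleted state.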
func (s *offerStorage) ageOffers() {
offer, ok := s.delayed.Pop().(Perishable)
if !ok {
log.Errorf("Expected Perishable, not %v", offer)
return
}
if details := offer.Details(); details != nil && !offer.HasExpired() {
// live offer has not expired yet: timed out early
// FWIW: early timeouts are more frequent when GOMAXPROCS is > 1
offer.addTo(s.delayed)
} else {
offer.age(s)
}
}
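// nextListener blocks until a registered listener becomes available and returns it.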
func (s *offerStorage) nextListener() *offerListener {
obj := s.listeners.Pop()
if listen, ok := obj.(*offerListener); !ok {
//programming error
panic(fmt.Sprintf("unexpected listener object %v", obj))
} else {
return listen
}
}
// notify listeners if we find an acceptable offer for them. listeners
// are garbage collected after a certain age (see offerListenerMaxAge).
// ids lists offer IDs that are retrievable from offer storage.
func (s *offerStorage) notifyListeners(ids func() (util.StringSet, uint64)) {
listener := s.nextListener() // blocking
offerIds, version := ids()
if listener.sawVersion == version {
// no changes to offer list, avoid growing older - just wait for new offers to arrive
listener.deadline = time.Now().Add(s.ListenerDelay)
s.listeners.Offer(listener, queue.KeepExisting)
return
}
listener.sawVersion = version
// notify if we find an acceptable offer
for id := range offerIds {
if offer, ok := s.Get(id); !ok || offer.HasExpired() {
continue
} else if listener.accepts(offer.Details()) {
log.V(3).Infof("Notifying offer listener %s", listener.id)
close(listener.notify)
return
}
}
// no interesting offers found, re-queue the listener
listener.age++
if listener.age < offerListenerMaxAge {
listener.deadline = time.Now().Add(s.ListenerDelay)
s.listeners.Offer(listener, queue.KeepExisting)
} else {
// garbage collection is as simple as not re-adding the listener to the queue
log.V(3).Infof("garbage collecting offer listener %s", listener.id)
}
}
func (s *offerStorage) Init(done <-chan struct{}) {
// zero delay, reap offers as soon as they expire
go runtime.Until(s.ageOffers, 0, done)
// cached offer ids for the purposes of listener notification
idCache := &stringsCache{
refill: func() util.StringSet {
result := util.NewStringSet()
for _, v := range s.offers.List() {
if offer, ok := v.(Perishable); ok {
result.Insert(offer.Id())
}
}
return result
},
ttl: offerIdCacheTTL,
}
go runtime.Until(func() { s.notifyListeners(idCache.Strings) }, notifyListenersDelay, done)
}
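// stringsCache lazily refreshes a cached string set every ttl and bumps its version whenever the refreshed set differs from the previous one.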
type stringsCache struct {
expiresAt time.Time
cached util.StringSet
ttl time.Duration
refill func() util.StringSet
version uint64
}
// not thread-safe
func (c *stringsCache) Strings() (util.StringSet, uint64) {
now := time.Now()
if c.expiresAt.Before(now) {
old := c.cached
c.cached = c.refill()
c.expiresAt = now.Add(c.ttl)
if !reflect.DeepEqual(old, c.cached) {
c.version++
}
}
return c.cached, c.version
}
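// slaveStorage maintains a mutex-protected mapping from offer ID to slave ID.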
type slaveStorage struct {
sync.Mutex
index map[string]string // map offerId to slaveId
}
func newSlaveStorage() *slaveStorage {
return &slaveStorage{
index: make(map[string]string),
}
}
// create a mapping between a slave and an offer
func (self *slaveStorage) add(slaveId, offerId string) {
self.Lock()
defer self.Unlock()
self.index[offerId] = slaveId
}
// delete the slave-offer mappings for slaveId, returns the IDs of the offers that were unmapped
func (self *slaveStorage) deleteSlave(slaveId string) util.StringSet {
offerIds := util.NewStringSet()
self.Lock()
defer self.Unlock()
for oid, sid := range self.index {
if sid == slaveId {
offerIds.Insert(oid)
delete(self.index, oid)
}
}
return offerIds
}
// delete the slave-offer mappings for offerId
func (self *slaveStorage) deleteOffer(offerId string) {
self.Lock()
defer self.Unlock()
delete(self.index, offerId)
}

View File

@@ -0,0 +1,391 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package offers
import (
"errors"
"sync/atomic"
"testing"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
mesos "github.com/mesos/mesos-go/mesosproto"
util "github.com/mesos/mesos-go/mesosutil"
)
func TestExpiredOffer(t *testing.T) {
t.Parallel()
ttl := 2 * time.Second
o := Expired("test", "testhost", ttl)
if o.Id() != "test" {
t.Error("expiredOffer does not return its Id")
}
if o.Host() != "testhost" {
t.Error("expiredOffer does not return its hostname")
}
if o.HasExpired() != true {
t.Error("expiredOffer is not expired")
}
if o.Details() != nil {
t.Error("expiredOffer does not return nil Details")
}
if o.Acquire() != false {
t.Error("expiredOffer must not be able to be acquired")
}
if delay := o.GetDelay(); !(0 < delay && delay <= ttl) {
t.Error("expiredOffer does not return a valid deadline")
}
} // TestExpiredOffer
func TestTimedOffer(t *testing.T) {
t.Parallel()
ttl := 2 * time.Second
now := time.Now()
o := &liveOffer{nil, now.Add(ttl), 0}
if o.HasExpired() {
t.Errorf("offer ttl was %v and should not have expired yet", ttl)
}
if !o.Acquire() {
t.Fatal("1st acquisition of offer failed")
}
o.Release()
if !o.Acquire() {
t.Fatal("2nd acquisition of offer failed")
}
if o.Acquire() {
t.Fatal("3rd acquisition of offer passed but prior claim was not released")
}
o.Release()
if !o.Acquire() {
t.Fatal("4th acquisition of offer failed")
}
o.Release()
time.Sleep(ttl)
if !o.HasExpired() {
t.Fatal("offer not expired after ttl passed")
}
if !o.Acquire() {
t.Fatal("5th acquisition of offer failed; should not be tied to expiration")
}
if o.Acquire() {
t.Fatal("6th acquisition of offer succeeded; should already be acquired")
}
} // TestTimedOffer
func TestOfferStorage(t *testing.T) {
ttl := time.Second / 4
var declinedNum int32
getDeclinedNum := func() int32 { return atomic.LoadInt32(&declinedNum) }
config := RegistryConfig{
DeclineOffer: func(offerId string) <-chan error {
atomic.AddInt32(&declinedNum, 1)
return proc.ErrorChan(nil)
},
Compat: func(o *mesos.Offer) bool {
return o.Hostname == nil || *o.Hostname != "incompatiblehost"
},
TTL: ttl,
LingerTTL: 2 * ttl,
}
storage := CreateRegistry(config)
done := make(chan struct{})
storage.Init(done)
// Add offer
id := util.NewOfferID("foo")
o := &mesos.Offer{Id: id}
storage.Add([]*mesos.Offer{o})
// Added offer should be in the storage
if obj, ok := storage.Get(id.GetValue()); obj == nil || !ok {
t.Error("offer not added")
}
if obj, _ := storage.Get(id.GetValue()); obj.Details() != o {
t.Error("added offer differs from returned offer")
}
// Not-added offer is not in storage
if obj, ok := storage.Get("bar"); obj != nil || ok {
t.Error("offer bar should not exist in storage")
}
// Deleted offer lingers in storage, is acquired and declined
offer, _ := storage.Get(id.GetValue())
declinedNumBefore := getDeclinedNum()
storage.Delete(id.GetValue(), "deleted for test")
if obj, _ := storage.Get(id.GetValue()); obj == nil {
t.Error("deleted offer is not lingering")
}
if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
t.Error("deleted offer is no expired")
}
if ok := offer.Acquire(); ok {
t.Error("deleted offer can be acquired")
}
if getDeclinedNum() <= declinedNumBefore {
t.Error("deleted offer was not declined")
}
// Acquired offer is only declined after 2*ttl
id = util.NewOfferID("foo2")
o = &mesos.Offer{Id: id}
storage.Add([]*mesos.Offer{o})
offer, _ = storage.Get(id.GetValue())
declinedNumBefore = getDeclinedNum()
offer.Acquire()
storage.Delete(id.GetValue(), "deleted for test")
if getDeclinedNum() > declinedNumBefore {
t.Error("acquired offer is declined")
}
offer.Release()
time.Sleep(3 * ttl)
if getDeclinedNum() <= declinedNumBefore {
t.Error("released offer is not declined after 2*ttl")
}
// Added offer should be expired after ttl, but lingering
id = util.NewOfferID("foo3")
o = &mesos.Offer{Id: id}
storage.Add([]*mesos.Offer{o})
time.Sleep(2 * ttl)
obj, ok := storage.Get(id.GetValue())
if obj == nil || !ok {
t.Error("offer not lingering after ttl")
}
if !obj.HasExpired() {
t.Error("offer is not expired after ttl")
}
// Should be deleted when waiting longer than LingerTTL
time.Sleep(2 * ttl)
if obj, ok := storage.Get(id.GetValue()); obj != nil || ok {
t.Error("offer not deleted after LingerTTL")
}
// Incompatible offer is declined
id = util.NewOfferID("foo4")
incompatibleHostname := "incompatiblehost"
o = &mesos.Offer{Id: id, Hostname: &incompatibleHostname}
declinedNumBefore = getDeclinedNum()
storage.Add([]*mesos.Offer{o})
if obj, ok := storage.Get(id.GetValue()); obj != nil || ok {
t.Error("incompatible offer not rejected")
}
if getDeclinedNum() <= declinedNumBefore {
t.Error("incompatible offer is not declined")
}
// Invalidated offers are not declined, but expired
id = util.NewOfferID("foo5")
o = &mesos.Offer{Id: id}
storage.Add([]*mesos.Offer{o})
offer, _ = storage.Get(id.GetValue())
declinedNumBefore = getDeclinedNum()
storage.Invalidate(id.GetValue())
if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
t.Error("invalidated offer is not expired")
}
if getDeclinedNum() > declinedNumBefore {
t.Error("invalidated offer is declined")
}
if ok := offer.Acquire(); ok {
t.Error("invalidated offer can be acquired")
}
// Invalidate "" will invalidate all offers
id = util.NewOfferID("foo6")
o = &mesos.Offer{Id: id}
storage.Add([]*mesos.Offer{o})
id2 := util.NewOfferID("foo7")
o2 := &mesos.Offer{Id: id2}
storage.Add([]*mesos.Offer{o2})
storage.Invalidate("")
if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
t.Error("invalidated offer is not expired")
}
if obj2, _ := storage.Get(id2.GetValue()); !obj2.HasExpired() {
t.Error("invalidated offer is not expired")
}
// InvalidateForSlave invalidates all offers for that slave, and only those
id = util.NewOfferID("foo8")
slaveId := util.NewSlaveID("test-slave")
o = &mesos.Offer{Id: id, SlaveId: slaveId}
storage.Add([]*mesos.Offer{o})
id2 = util.NewOfferID("foo9")
o2 = &mesos.Offer{Id: id2}
storage.Add([]*mesos.Offer{o2})
storage.InvalidateForSlave(slaveId.GetValue())
if obj, _ := storage.Get(id.GetValue()); !obj.HasExpired() {
t.Error("invalidated offer for test-slave is not expired")
}
if obj2, _ := storage.Get(id2.GetValue()); obj2.HasExpired() {
t.Error("invalidated offer another slave is expired")
}
close(done)
} // TestOfferStorage
func TestListen(t *testing.T) {
ttl := time.Second / 4
config := RegistryConfig{
DeclineOffer: func(offerId string) <-chan error {
return proc.ErrorChan(nil)
},
Compat: func(o *mesos.Offer) bool {
return true
},
TTL: ttl,
ListenerDelay: ttl / 2,
}
storage := CreateRegistry(config)
done := make(chan struct{})
storage.Init(done)
// Create two listeners with a hostname filter
hostname1 := "hostname1"
hostname2 := "hostname2"
listener1 := storage.Listen("listener1", func(offer *mesos.Offer) bool {
return offer.GetHostname() == hostname1
})
listener2 := storage.Listen("listener2", func(offer *mesos.Offer) bool {
return offer.GetHostname() == hostname2
})
// Add hostname1 offer
id := util.NewOfferID("foo")
o := &mesos.Offer{Id: id, Hostname: &hostname1}
storage.Add([]*mesos.Offer{o})
// listener1 is notified by closing channel
select {
case _, more := <-listener1:
if more {
t.Error("listener1 is not closed")
}
}
// listener2 is not notified within ttl
select {
case <-listener2:
t.Error("listener2 is notified")
case <-time.After(ttl):
}
close(done)
} // TestListen
func TestWalk(t *testing.T) {
t.Parallel()
config := RegistryConfig{
DeclineOffer: func(offerId string) <-chan error {
return proc.ErrorChan(nil)
},
TTL: 0 * time.Second,
LingerTTL: 0 * time.Second,
ListenerDelay: 0 * time.Second,
}
storage := CreateRegistry(config)
acceptedOfferId := ""
walked := 0
walker1 := func(p Perishable) (bool, error) {
walked++
if p.Acquire() {
acceptedOfferId = p.Details().Id.GetValue()
return true, nil
}
return false, nil
}
// sanity check
err := storage.Walk(walker1)
if err != nil {
t.Fatalf("received impossible error %v", err)
}
if walked != 0 {
t.Fatal("walked empty storage")
}
if acceptedOfferId != "" {
t.Fatal("somehow found an offer when registry was empty")
}
impl, ok := storage.(*offerStorage)
if !ok {
t.Fatal("unexpected offer storage impl")
}
// single offer
ttl := 2 * time.Second
now := time.Now()
o := &liveOffer{&mesos.Offer{Id: util.NewOfferID("foo")}, now.Add(ttl), 0}
impl.offers.Add(o)
err = storage.Walk(walker1)
if err != nil {
t.Fatalf("received impossible error %v", err)
}
if walked != 1 {
t.Fatalf("walk count %d", walked)
}
if acceptedOfferId != "foo" {
t.Fatalf("found offer %v", acceptedOfferId)
}
acceptedOfferId = ""
err = storage.Walk(walker1)
if err != nil {
t.Fatalf("received impossible error %v", err)
}
if walked != 2 {
t.Fatalf("walk count %d", walked)
}
if acceptedOfferId != "" {
t.Fatalf("found offer %v", acceptedOfferId)
}
walker2 := func(p Perishable) (bool, error) {
walked++
return true, nil
}
err = storage.Walk(walker2)
if err != nil {
t.Fatalf("received impossible error %v", err)
}
if walked != 3 {
t.Fatalf("walk count %d", walked)
}
if acceptedOfferId != "" {
t.Fatalf("found offer %v", acceptedOfferId)
}
walker3 := func(p Perishable) (bool, error) {
walked++
return true, errors.New("baz")
}
err = storage.Walk(walker3)
if err == nil {
t.Fatal("expected error")
}
if walked != 4 {
t.Fatalf("walk count %d", walked)
}
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package proc provides opinionated utilities for processing background
// operations and future errors, somewhat inspired by libprocess.
package proc

View File

@@ -0,0 +1,34 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package proc
import (
"errors"
)
var (
errProcessTerminated = errors.New("cannot execute action because process has terminated")
errIllegalState = errors.New("illegal state, cannot execute action")
)
func IsProcessTerminated(err error) bool {
return err == errProcessTerminated
}
func IsIllegalState(err error) bool {
return err == errIllegalState
}

View File

@@ -0,0 +1,377 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package proc
import (
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
log "github.com/golang/glog"
)
const (
// if the action processor crashes (if some Action panics) then we
// wait this long before spinning up the action processor again.
defaultActionHandlerCrashDelay = 100 * time.Millisecond
// how many actions we can store in the backlog
defaultActionQueueDepth = 1024
)
type procImpl struct {
Config
backlog chan Action // action queue
terminate chan struct{} // signaled via close()
wg sync.WaitGroup // End() terminates when the wait is over
done runtime.Signal
state *stateType
pid uint32
writeLock sync.Mutex // avoid data race between write and close of backlog
changed *sync.Cond // wait/signal for backlog changes
engine DoerFunc // isolated this for easier unit testing later on
running chan struct{} // closes once event loop processing starts
dead chan struct{} // closes upon completion of process termination
}
type Config struct {
// cooldown period in between deferred action crashes
actionHandlerCrashDelay time.Duration
// determines the size of the deferred action backlog
actionQueueDepth uint32
}
var (
defaultConfig = Config{
actionHandlerCrashDelay: defaultActionHandlerCrashDelay,
actionQueueDepth: defaultActionQueueDepth,
}
pid uint32
closedErrChan <-chan error
)
func init() {
ch := make(chan error)
close(ch)
closedErrChan = ch
}
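// New returns a Process backed by the default configuration.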
func New() Process {
return newConfigured(defaultConfig)
}
func newConfigured(config Config) Process {
state := stateNew
pi := &procImpl{
Config: config,
backlog: make(chan Action, config.actionQueueDepth),
terminate: make(chan struct{}),
state: &state,
pid: atomic.AddUint32(&pid, 1),
running: make(chan struct{}),
dead: make(chan struct{}),
}
pi.engine = DoerFunc(pi.doLater)
pi.changed = sync.NewCond(&pi.writeLock)
pi.wg.Add(1) // symmetrical to wg.Done() in End()
pi.done = pi.begin()
return pi
}
// returns a chan that closes upon termination of the action processing loop
func (self *procImpl) Done() <-chan struct{} {
return self.done
}
func (self *procImpl) Running() <-chan struct{} {
return self.running
}
func (self *procImpl) begin() runtime.Signal {
if !self.state.transition(stateNew, stateRunning) {
panic(fmt.Errorf("failed to transition from New to Idle state"))
}
defer log.V(2).Infof("started process %d", self.pid)
var entered runtime.Latch
// execute actions on the backlog chan
return runtime.After(func() {
runtime.Until(func() {
if entered.Acquire() {
close(self.running)
self.wg.Add(1)
}
for action := range self.backlog {
select {
case <-self.terminate:
return
default:
// signal to indicate there's room in the backlog now
self.changed.Broadcast()
// rely on Until to handle action panics
action()
}
}
}, self.actionHandlerCrashDelay, self.terminate)
}).Then(func() {
log.V(2).Infof("finished processing action backlog for process %d", self.pid)
if !entered.Acquire() {
self.wg.Done()
}
})
}
// execute some action in the context of the current process. Actions
// executed via this func are to be executed in a concurrency-safe manner:
// no two actions should execute at the same time. invocations of this func
// should not block for very long, unless the action backlog is full or the
// process is terminating.
// returns errProcessTerminated if the process already ended.
func (self *procImpl) doLater(deferredAction Action) (err <-chan error) {
a := Action(func() {
self.wg.Add(1)
defer self.wg.Done()
deferredAction()
})
scheduled := false
self.writeLock.Lock()
defer self.writeLock.Unlock()
for err == nil && !scheduled {
switch s := self.state.get(); s {
case stateRunning:
select {
case self.backlog <- a:
scheduled = true
default:
self.changed.Wait()
}
case stateTerminal:
err = ErrorChan(errProcessTerminated)
default:
err = ErrorChan(errIllegalState)
}
}
return
}
// implementation of Doer interface, schedules some action to be executed via
// the current execution engine
func (self *procImpl) Do(a Action) <-chan error {
return self.engine(a)
}
// spawn a goroutine that waits for an error. if a non-nil error is read from the
// channel then the handler func is invoked, otherwise (nil error or closed chan)
// the handler is skipped. if a nil handler is specified then it's not invoked.
// the signal chan that's returned closes once the error process logic (and handler,
// if any) has completed.
func OnError(ch <-chan error, f func(error), abort <-chan struct{}) <-chan struct{} {
return runtime.After(func() {
if ch == nil {
return
}
select {
case err, ok := <-ch:
if ok && err != nil && f != nil {
f(err)
}
case <-abort:
if f != nil {
f(errProcessTerminated)
}
}
})
}
func (self *procImpl) OnError(ch <-chan error, f func(error)) <-chan struct{} {
return OnError(ch, f, self.Done())
}
func (self *procImpl) flush() {
log.V(2).Infof("flushing action backlog for process %d", self.pid)
i := 0
//TODO: replace with `for range self.backlog` once Go 1.3 support is dropped
for {
_, open := <-self.backlog
if !open {
break
}
i++
}
log.V(2).Infof("flushed %d backlog actions for process %d", i, self.pid)
}
func (self *procImpl) End() <-chan struct{} {
if self.state.transitionTo(stateTerminal, stateTerminal) {
go func() {
defer close(self.dead)
self.writeLock.Lock()
defer self.writeLock.Unlock()
log.V(2).Infof("terminating process %d", self.pid)
close(self.backlog)
close(self.terminate)
self.wg.Done()
self.changed.Broadcast()
log.V(2).Infof("waiting for deferred actions to complete")
// wait for all pending actions to complete, then flush the backlog
self.wg.Wait()
self.flush()
}()
}
return self.dead
}
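// errorOnce delivers at most one error via Err(); any subsequent reports are silently dropped.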
type errorOnce struct {
once sync.Once
err chan error
abort <-chan struct{}
}
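// NewErrorOnce returns an ErrorOnce whose forwarding gives up, reporting errProcessTerminated, once the abort chan closes.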
func NewErrorOnce(abort <-chan struct{}) ErrorOnce {
return &errorOnce{
err: make(chan error, 1),
abort: abort,
}
}
func (b *errorOnce) Err() <-chan error {
return b.err
}
func (b *errorOnce) Reportf(msg string, args ...interface{}) {
b.Report(fmt.Errorf(msg, args...))
}
func (b *errorOnce) Report(err error) {
b.once.Do(func() {
select {
case b.err <- err:
default:
}
})
}
func (b *errorOnce) Send(errIn <-chan error) ErrorOnce {
go b.forward(errIn)
return b
}
func (b *errorOnce) forward(errIn <-chan error) {
if errIn == nil {
b.Report(nil)
return
}
select {
case err, _ := <-errIn:
b.Report(err)
case <-b.abort:
b.Report(errProcessTerminated)
}
}
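// processAdapter executes actions of a delegate Doer within the execution context of a parent Process; see DoWith.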
type processAdapter struct {
parent Process
delegate Doer
}
func (p *processAdapter) Do(a Action) <-chan error {
if p == nil || p.parent == nil || p.delegate == nil {
return ErrorChan(errIllegalState)
}
errCh := NewErrorOnce(p.Done())
go func() {
errOuter := p.parent.Do(func() {
errInner := p.delegate.Do(a)
errCh.forward(errInner)
})
// if the outer err is !nil then either the parent failed to schedule the
// action, or else it backgrounded the scheduling task.
if errOuter != nil {
errCh.forward(errOuter)
}
}()
return errCh.Err()
}
func (p *processAdapter) End() <-chan struct{} {
if p != nil && p.parent != nil {
return p.parent.End()
}
return nil
}
func (p *processAdapter) Done() <-chan struct{} {
if p != nil && p.parent != nil {
return p.parent.Done()
}
return nil
}
func (p *processAdapter) Running() <-chan struct{} {
if p != nil && p.parent != nil {
return p.parent.Running()
}
return nil
}
func (p *processAdapter) OnError(ch <-chan error, f func(error)) <-chan struct{} {
if p != nil && p.parent != nil {
return p.parent.OnError(ch, f)
}
return nil
}
// returns a process that, within its execution context, delegates to the specified Doer.
// if the given Doer instance is nil, a valid Process is still returned though calls to its
// Do() implementation will always return errIllegalState.
// if the given Process instance is nil then in addition to the behavior in the prior sentence,
// calls to End() and Done() are effectively noops.
func DoWith(other Process, d Doer) Process {
return &processAdapter{
parent: other,
delegate: d,
}
}
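// A minimal usage sketch of DoWith (mirrors the TestProc_doWith test): every action
// submitted to the decorated process is routed through the extra DoerFunc before it runs.
//
//   p := New()
//   decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
//       a() // run the delegated action inline
//       return nil
//   }))
//   errCh := decorated.Do(func() { /* work runs in the process context */ })
//   decorated.OnError(errCh, func(err error) { /* handle scheduling errors */ })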
func ErrorChanf(msg string, args ...interface{}) <-chan error {
return ErrorChan(fmt.Errorf(msg, args...))
}
func ErrorChan(err error) <-chan error {
if err == nil {
return closedErrChan
}
ch := make(chan error, 1)
ch <- err
return ch
}
// invoke f on action a. returns an illegal state error if f is nil.
func (f DoerFunc) Do(a Action) <-chan error {
if f != nil {
return f(a)
}
return ErrorChan(errIllegalState)
}

View File

@@ -0,0 +1,373 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package proc
import (
"fmt"
"sync"
"testing"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
log "github.com/golang/glog"
)
// calls t.Fatalf if the elapsed time d passes before the signal chan done is closed
func fatalAfter(t *testing.T, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
select {
case <-done:
case <-time.After(d):
t.Fatalf(msg, args...)
}
}
func errorAfter(errOnce ErrorOnce, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
select {
case <-done:
case <-time.After(d):
errOnce.Reportf(msg, args...)
}
}
// calls t.Fatalf if the signal chan done closes before the elapsed time d passes
func fatalOn(t *testing.T, done <-chan struct{}, d time.Duration, msg string, args ...interface{}) {
select {
case <-done:
t.Fatalf(msg, args...)
case <-time.After(d):
}
}
func TestProc_manyEndings(t *testing.T) {
p := New()
const COUNT = 20
var wg sync.WaitGroup
wg.Add(COUNT)
for i := 0; i < COUNT; i++ {
runtime.On(p.End(), wg.Done)
}
fatalAfter(t, runtime.After(wg.Wait), 5*time.Second, "timed out waiting for loose End()s")
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
func TestProc_singleAction(t *testing.T) {
p := New()
scheduled := make(chan struct{})
called := make(chan struct{})
go func() {
log.Infof("do'ing deferred action")
defer close(scheduled)
err := p.Do(func() {
defer close(called)
log.Infof("deferred action invoked")
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}()
fatalAfter(t, scheduled, 5*time.Second, "timed out waiting for deferred action to be scheduled")
fatalAfter(t, called, 5*time.Second, "timed out waiting for deferred action to be invoked")
p.End()
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
func TestProc_singleActionEnd(t *testing.T) {
p := New()
scheduled := make(chan struct{})
called := make(chan struct{})
go func() {
log.Infof("do'ing deferred action")
defer close(scheduled)
err := p.Do(func() {
defer close(called)
log.Infof("deferred action invoked")
p.End()
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}()
fatalAfter(t, scheduled, 5*time.Second, "timed out waiting for deferred action to be scheduled")
fatalAfter(t, called, 5*time.Second, "timed out waiting for deferred action to be invoked")
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
func TestProc_multiAction(t *testing.T) {
p := New()
const COUNT = 10
var called sync.WaitGroup
called.Add(COUNT)
// test FIFO property
next := 0
for i := 0; i < COUNT; i++ {
log.Infof("do'ing deferred action %d", i)
idx := i
err := p.Do(func() {
defer called.Done()
log.Infof("deferred action invoked")
if next != idx {
t.Fatalf("expected index %d instead of %d", idx, next)
}
next++
})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
fatalAfter(t, runtime.After(called.Wait), 2*time.Second, "timed out waiting for deferred actions to be invoked")
p.End()
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
func TestProc_goodLifecycle(t *testing.T) {
p := New()
p.End()
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
func TestProc_doWithDeadProc(t *testing.T) {
p := New()
p.End()
time.Sleep(100 * time.Millisecond)
errUnexpected := fmt.Errorf("unexpected execution of delegated action")
decorated := DoWith(p, DoerFunc(func(_ Action) <-chan error {
return ErrorChan(errUnexpected)
}))
decorated.Do(func() {})
fatalAfter(t, decorated.Done(), 5*time.Second, "timed out waiting for process death")
}
func TestProc_doWith(t *testing.T) {
p := New()
delegated := false
decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
delegated = true
a()
return nil
}))
executed := make(chan struct{})
err := decorated.Do(func() {
defer close(executed)
if !delegated {
t.Fatalf("expected delegated execution")
}
})
if err == nil {
t.Fatalf("expected !nil error chan")
}
fatalAfter(t, executed, 5*time.Second, "timed out waiting for deferred execution")
fatalAfter(t, decorated.OnError(err, func(e error) {
t.Fatalf("unexpected error: %v", err)
}), 1*time.Second, "timed out waiting for doer result")
decorated.End()
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
func TestProc_doWithNestedTwice(t *testing.T) {
p := New()
delegated := false
decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
a()
return nil
}))
decorated2 := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
delegated = true
a()
return nil
}))
executed := make(chan struct{})
err := decorated2.Do(func() {
defer close(executed)
if !delegated {
t.Fatalf("expected delegated execution")
}
})
if err == nil {
t.Fatalf("expected !nil error chan")
}
fatalAfter(t, executed, 5*time.Second, "timed out waiting for deferred execution")
fatalAfter(t, decorated2.OnError(err, func(e error) {
t.Fatalf("unexpected error: %v", err)
}), 1*time.Second, "timed out waiting for doer result")
decorated2.End()
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
func TestProc_doWithNestedErrorPropagation(t *testing.T) {
p := New()
delegated := false
decorated := DoWith(p, DoerFunc(func(a Action) <-chan error {
a()
return nil
}))
expectedErr := fmt.Errorf("expecting this")
errOnce := NewErrorOnce(p.Done())
decorated2 := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
delegated = true
a()
errOnce.Reportf("unexpected error in decorator2")
return ErrorChanf("another unexpected error in decorator2")
}))
executed := make(chan struct{})
err := decorated2.Do(func() {
defer close(executed)
if !delegated {
t.Fatalf("expected delegated execution")
}
errOnce.Report(expectedErr)
})
if err == nil {
t.Fatalf("expected !nil error chan")
}
errOnce.Send(err)
foundError := false
fatalAfter(t, executed, 1*time.Second, "timed out waiting for deferred execution")
fatalAfter(t, decorated2.OnError(errOnce.Err(), func(e error) {
if e != expectedErr {
t.Fatalf("unexpected error: %v", err)
} else {
foundError = true
}
}), 1*time.Second, "timed out waiting for doer result")
if !foundError {
t.Fatalf("expected a propagated error")
}
decorated2.End()
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
func runDelegationTest(t *testing.T, p Process, name string, errOnce ErrorOnce) {
defer func() {
t.Logf("runDelegationTest finished at " + time.Now().String())
}()
var decorated Process
decorated = p
const DEPTH = 100
var wg sync.WaitGroup
wg.Add(DEPTH)
y := 0
for x := 1; x <= DEPTH; x++ {
x := x
nextp := DoWith(decorated, DoerFunc(func(a Action) <-chan error {
if x == 1 {
t.Logf("delegate chain invoked for " + name)
}
y++
if y != x {
return ErrorChanf("out of order delegated execution")
}
defer wg.Done()
a()
return nil
}))
decorated = nextp
}
executed := make(chan struct{})
errCh := decorated.Do(func() {
defer close(executed)
if y != DEPTH {
errOnce.Reportf("expected delegated execution")
}
t.Logf("executing deferred action: " + name + " at " + time.Now().String())
errOnce.Send(nil) // we completed without error, let the listener know
})
if errCh == nil {
t.Fatalf("expected !nil error chan")
}
// forward any scheduling errors to the listener; NOTHING else should attempt to read
// from errCh after this point
errOnce.Send(errCh)
errorAfter(errOnce, executed, 5*time.Second, "timed out waiting for deferred execution")
t.Logf("runDelegationTest received executed signal at " + time.Now().String())
}
func TestProc_doWithNestedX(t *testing.T) {
t.Logf("starting test case at " + time.Now().String())
p := New()
errOnce := NewErrorOnce(p.Done())
runDelegationTest(t, p, "nested", errOnce)
<-p.End()
select {
case err := <-errOnce.Err():
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
case <-time.After(5 * time.Second):
t.Fatalf("timed out waiting for doer result")
}
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}
// intended to be run with -race
func TestProc_doWithNestedXConcurrent(t *testing.T) {
p := New()
errOnce := NewErrorOnce(p.Done())
var wg sync.WaitGroup
const CONC = 20
wg.Add(CONC)
for i := 0; i < CONC; i++ {
i := i
runtime.After(func() { runDelegationTest(t, p, fmt.Sprintf("nested%d", i), errOnce) }).Then(wg.Done)
}
ch := runtime.After(wg.Wait)
fatalAfter(t, ch, 10*time.Second, "timed out waiting for concurrent delegates")
<-p.End()
select {
case err := <-errOnce.Err():
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
case <-time.After(5 * time.Second):
t.Fatalf("timed out waiting for doer result")
}
fatalAfter(t, p.Done(), 5*time.Second, "timed out waiting for process death")
}

View File

@@ -0,0 +1,55 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package proc
import (
"sync/atomic"
)
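// stateType models the process lifecycle state and is only manipulated via atomic operations.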
type stateType int32
const (
stateNew stateType = iota
stateRunning
stateTerminal
)
func (s *stateType) get() stateType {
return stateType(atomic.LoadInt32((*int32)(s)))
}
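// transition atomically swaps the state from 'from' to 'to', returning true if the swap succeeded.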
func (s *stateType) transition(from, to stateType) bool {
return atomic.CompareAndSwapInt32((*int32)(s), int32(from), int32(to))
}
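// transitionTo atomically moves the state to 'to' unless the current state matches one of the 'unless' states; it returns true once the transition has happened.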
func (s *stateType) transitionTo(to stateType, unless ...stateType) bool {
if len(unless) == 0 {
atomic.StoreInt32((*int32)(s), int32(to))
return true
}
for {
state := s.get()
for _, x := range unless {
if state == x {
return false
}
}
if s.transition(state, to) {
return true
}
}
}

View File

@@ -0,0 +1,71 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package proc
// something that executes in the context of a process
type Action func()
type Context interface {
// end (terminate) the execution context
End() <-chan struct{}
// return a signal chan that will close upon the termination of this process
Done() <-chan struct{}
}
type Doer interface {
// execute some action in some context. actions are to be executed in a
// concurrency-safe manner: no two actions should execute at the same time.
// errors are generated if the action cannot be executed (not by the execution
// of the action) and should be testable with the error API of this package,
// for example, IsProcessTerminated.
Do(Action) <-chan error
}
// adapter func for Doer interface
type DoerFunc func(Action) <-chan error
type Process interface {
Context
Doer
// see top level OnError func. this implementation will terminate upon the arrival of
// an error (and subsequently invoke the error handler, if given) or else the termination
// of the process (testable via IsProcessTerminated).
OnError(<-chan error, func(error)) <-chan struct{}
// return a signal chan that will close once the process is ready to run actions
Running() <-chan struct{}
}
// this is an error promise. if we ever start building out support for other promise types it will probably
// make sense to group them in some sort of "promises" package.
type ErrorOnce interface {
// return a chan that only ever sends one error, either obtained via Report() or Forward()
Err() <-chan error
// reports the given error via Err(), but only if no other errors have been reported or forwarded
Report(error)
Reportf(string, ...interface{})
// waits for an error on the incoming chan, the result of which is later obtained via Err() (if no
// other errors have been reported or forwarded)
forward(<-chan error)
// non-blocking, spins up a goroutine that reports an error (if any) that occurs on the error chan.
Send(<-chan error) ErrorOnce
}

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package profile contains reusable code for profiling Go programs with pprof.
package profile

View File

@@ -0,0 +1,27 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package profile
import "net/http"
import "net/http/pprof"
func InstallHandler(m *http.ServeMux) {
// register similar endpoints as net/http/pprof.init() does
m.Handle("/debug/pprof/", http.HandlerFunc(pprof.Index))
m.Handle("/debug/pprof/profile", http.HandlerFunc(pprof.Profile))
m.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol))
}

View File

@@ -0,0 +1,373 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"container/heap"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
)
type qitem struct {
value interface{}
priority Priority
index int
readd func(item *qitem) // re-add the value of the item to the queue
}
// A priorityQueue implements heap.Interface and holds qitems.
type priorityQueue []*qitem
func (pq priorityQueue) Len() int { return len(pq) }
func (pq priorityQueue) Less(i, j int) bool {
return pq[i].priority.ts.Before(pq[j].priority.ts)
}
func (pq priorityQueue) Swap(i, j int) {
pq[i], pq[j] = pq[j], pq[i]
pq[i].index = i
pq[j].index = j
}
func (pq *priorityQueue) Push(x interface{}) {
n := len(*pq)
item := x.(*qitem)
item.index = n
*pq = append(*pq, item)
}
func (pq *priorityQueue) Pop() interface{} {
old := *pq
n := len(old)
item := old[n-1]
item.index = -1 // for safety
*pq = old[0 : n-1]
return item
}
// concurrency-safe, deadline-oriented queue that returns items after their
// delay period has expired.
type DelayQueue struct {
queue priorityQueue
lock sync.RWMutex
cond sync.Cond
}
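// NewDelayQueue returns an empty DelayQueue ready for use.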
func NewDelayQueue() *DelayQueue {
q := &DelayQueue{}
q.cond.L = &q.lock
return q
}
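// Add enqueues a Delayed item; the deadline is computed once from the item's delay and is reused if the item is ever re-added internally.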
func (q *DelayQueue) Add(d Delayed) {
deadline := extractFromDelayed(d)
q.lock.Lock()
defer q.lock.Unlock()
// readd using the original deadline computed from the original delay
var readd func(*qitem)
readd = func(qp *qitem) {
q.lock.Lock()
defer q.lock.Unlock()
heap.Push(&q.queue, &qitem{
value: d,
priority: deadline,
readd: readd,
})
q.cond.Broadcast()
}
heap.Push(&q.queue, &qitem{
value: d,
priority: deadline,
readd: readd,
})
q.cond.Broadcast()
}
// If there's a deadline reported by d.Deadline() then `d` is added to the
// queue and this func returns true.
func (q *DelayQueue) Offer(d Deadlined) bool {
deadline, ok := extractFromDeadlined(d)
if ok {
q.lock.Lock()
defer q.lock.Unlock()
heap.Push(&q.queue, &qitem{
value: d,
priority: deadline,
readd: func(qp *qitem) {
q.Offer(qp.value.(Deadlined))
},
})
q.cond.Broadcast()
}
return ok
}
// wait for the delay of the next item in the queue to expire, blocking if
// there are no items in the queue. does not guarantee first-come-first-serve
// ordering with respect to clients.
func (q *DelayQueue) Pop() interface{} {
// doesn't implement cancellation, will always return a non-nil value
return q.pop(func() *qitem {
q.lock.Lock()
defer q.lock.Unlock()
for q.queue.Len() == 0 {
q.cond.Wait()
}
x := heap.Pop(&q.queue)
item := x.(*qitem)
return item
}, nil)
}
// returns a non-nil value from the queue, or else nil if/when cancelled; if cancel
// is nil then cancellation is disabled and this func must return a non-nil value.
func (q *DelayQueue) pop(next func() *qitem, cancel <-chan struct{}) interface{} {
var ch chan struct{}
for {
item := next()
if item == nil {
// cancelled
return nil
}
x := item.value
waitingPeriod := item.priority.ts.Sub(time.Now())
if waitingPeriod >= 0 {
// listen for calls to Add() while we're waiting for the deadline
if ch == nil {
ch = make(chan struct{}, 1)
}
go func() {
q.lock.Lock()
defer q.lock.Unlock()
q.cond.Wait()
ch <- struct{}{}
}()
select {
case <-cancel:
item.readd(item)
return nil
case <-ch:
// we may no longer have the earliest deadline, re-try
item.readd(item)
continue
case <-time.After(waitingPeriod):
// noop
case <-item.priority.notify:
// noop
}
}
return x
}
}
// If multiple adds/updates of a single item happen while an item is in the
// queue before it has been processed, it will only be processed once, and
// when it is processed, the most recent version will be processed. Items are
// popped in order of their priority, currently controlled by a delay or
// deadline assigned to each item in the queue.
type DelayFIFO struct {
// internal deadline-based priority queue
delegate *DelayQueue
// We depend on the property that items in the set are in the queue and vice versa.
items map[string]*qitem
deadlinePolicy DeadlinePolicy
}
func (q *DelayFIFO) lock() {
q.delegate.lock.Lock()
}
func (q *DelayFIFO) unlock() {
q.delegate.lock.Unlock()
}
func (q *DelayFIFO) rlock() {
q.delegate.lock.RLock()
}
func (q *DelayFIFO) runlock() {
q.delegate.lock.RUnlock()
}
func (q *DelayFIFO) queue() *priorityQueue {
return &q.delegate.queue
}
func (q *DelayFIFO) cond() *sync.Cond {
return &q.delegate.cond
}
// Add inserts an item, and puts it in the queue. The item is only enqueued
// if it doesn't already exist in the set.
func (q *DelayFIFO) Add(d UniqueDelayed, rp ReplacementPolicy) {
deadline := extractFromDelayed(d)
id := d.GetUID()
var adder func(*qitem)
adder = func(*qitem) {
q.add(id, deadline, d, KeepExisting, adder)
}
q.add(id, deadline, d, rp, adder)
}
func (q *DelayFIFO) Offer(d UniqueDeadlined, rp ReplacementPolicy) bool {
if deadline, ok := extractFromDeadlined(d); ok {
id := d.GetUID()
q.add(id, deadline, d, rp, func(qp *qitem) { q.Offer(qp.value.(UniqueDeadlined), KeepExisting) })
return true
}
return false
}
func (q *DelayFIFO) add(id string, deadline Priority, value interface{}, rp ReplacementPolicy, adder func(*qitem)) {
q.lock()
defer q.unlock()
if item, exists := q.items[id]; !exists {
item = &qitem{
value: value,
priority: deadline,
readd: adder,
}
heap.Push(q.queue(), item)
q.items[id] = item
} else {
// this is an update of an existing item
item.value = rp.replacementValue(item.value, value)
item.priority = q.deadlinePolicy.nextDeadline(item.priority, deadline)
heap.Fix(q.queue(), item.index)
}
q.cond().Broadcast()
}
// Delete removes an item. It doesn't add it to the queue, because
// this implementation assumes the consumer only cares about the objects,
// not their priority order.
func (f *DelayFIFO) Delete(id string) {
f.lock()
defer f.unlock()
delete(f.items, id)
}
// List returns a list of all the items.
func (f *DelayFIFO) List() []UniqueID {
f.rlock()
defer f.runlock()
list := make([]UniqueID, 0, len(f.items))
for _, item := range f.items {
list = append(list, item.value.(UniqueDelayed))
}
return list
}
// ContainedIDs returns a util.StringSet containing all IDs of the stored items.
// This is a snapshot of a moment in time, and one should keep in mind that
// other goroutines can add or remove items after you call this.
func (c *DelayFIFO) ContainedIDs() util.StringSet {
c.rlock()
defer c.runlock()
set := util.StringSet{}
for id := range c.items {
set.Insert(id)
}
return set
}
// Get returns the requested item, or sets exists=false.
func (f *DelayFIFO) Get(id string) (UniqueID, bool) {
f.rlock()
defer f.runlock()
if item, exists := f.items[id]; exists {
return item.value.(UniqueID), true
}
return nil, false
}
// Variant of DelayQueue.Pop() for UniqueDelayed items
func (q *DelayFIFO) Await(timeout time.Duration) UniqueID {
cancel := make(chan struct{})
ch := make(chan interface{}, 1)
go func() { ch <- q.pop(cancel) }()
var x interface{}
select {
case <-time.After(timeout):
close(cancel)
x = <-ch
case x = <-ch:
// noop
}
if x != nil {
return x.(UniqueID)
}
return nil
}
// Variant of DelayQueue.Pop() for UniqueDelayed items
func (q *DelayFIFO) Pop() UniqueID {
return q.pop(nil).(UniqueID)
}
// variant of DelayQueue.Pop that implements optional cancellation
func (q *DelayFIFO) pop(cancel chan struct{}) interface{} {
next := func() *qitem {
q.lock()
defer q.unlock()
for {
for q.queue().Len() == 0 {
signal := make(chan struct{})
go func() {
defer close(signal)
q.cond().Wait()
}()
select {
case <-cancel:
// we may not have the lock yet, so
// broadcast to abort Wait, then
// return after lock re-acquisition
q.cond().Broadcast()
<-signal
return nil
case <-signal:
// we have the lock, re-check
// the queue for data...
}
}
x := heap.Pop(q.queue())
item := x.(*qitem)
unique := item.value.(UniqueID)
uid := unique.GetUID()
if _, ok := q.items[uid]; !ok {
// item was deleted, keep looking
continue
}
delete(q.items, uid)
return item
}
}
return q.delegate.pop(next, cancel)
}
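// NewDelayFIFO returns an empty DelayFIFO backed by a fresh DelayQueue.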
func NewDelayFIFO() *DelayFIFO {
f := &DelayFIFO{
delegate: NewDelayQueue(),
items: map[string]*qitem{},
}
return f
}

View File

@@ -0,0 +1,406 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
const (
tolerance = 100 * time.Millisecond // go time delays aren't perfect, this is our tolerance for errors WRT expected timeouts
)
func timedPriority(t time.Time) Priority {
return Priority{ts: t}
}
func TestPQ(t *testing.T) {
t.Parallel()
var pq priorityQueue
if pq.Len() != 0 {
t.Fatalf("pq should be empty")
}
now := timedPriority(time.Now())
now2 := timedPriority(now.ts.Add(2 * time.Second))
pq.Push(&qitem{priority: now2})
if pq.Len() != 1 {
t.Fatalf("pq.len should be 1")
}
x := pq.Pop()
if x == nil {
t.Fatalf("x is nil")
}
if pq.Len() != 0 {
t.Fatalf("pq should be empty")
}
item := x.(*qitem)
if !item.priority.Equal(now2) {
t.Fatalf("item.priority != now2")
}
pq.Push(&qitem{priority: now2})
pq.Push(&qitem{priority: now2})
pq.Push(&qitem{priority: now2})
pq.Push(&qitem{priority: now2})
pq.Push(&qitem{priority: now2})
pq.Pop()
pq.Pop()
pq.Pop()
pq.Pop()
pq.Pop()
if pq.Len() != 0 {
t.Fatalf("pq should be empty")
}
now4 := timedPriority(now.ts.Add(4 * time.Second))
now6 := timedPriority(now.ts.Add(6 * time.Second))
pq.Push(&qitem{priority: now2})
pq.Push(&qitem{priority: now4})
pq.Push(&qitem{priority: now6})
pq.Swap(0, 2)
if !pq[0].priority.Equal(now6) || !pq[2].priority.Equal(now2) {
t.Fatalf("swap failed")
}
if pq.Less(1, 2) {
t.Fatalf("now4 < now2")
}
}
func TestPopEmptyPQ(t *testing.T) {
t.Parallel()
defer func() {
if r := recover(); r == nil {
t.Fatalf("Expected panic from popping an empty PQ")
}
}()
var pq priorityQueue
pq.Pop()
}
type testjob struct {
d time.Duration
t time.Time
deadline *time.Time
uid string
instance int
}
func (j *testjob) GetDelay() time.Duration {
return j.d
}
func (j testjob) GetUID() string {
return j.uid
}
func (td *testjob) Deadline() (deadline time.Time, ok bool) {
if td.deadline != nil {
return *td.deadline, true
} else {
return time.Now(), false
}
}
func TestDQ_sanity_check(t *testing.T) {
t.Parallel()
dq := NewDelayQueue()
delay := 2 * time.Second
dq.Add(&testjob{d: delay})
before := time.Now()
x := dq.Pop()
now := time.Now()
waitPeriod := now.Sub(before)
if waitPeriod+tolerance < delay {
t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
}
if x == nil {
t.Fatalf("x is nil")
}
item := x.(*testjob)
if item.d != delay {
t.Fatalf("d != delay")
}
}
func TestDQ_Offer(t *testing.T) {
t.Parallel()
assert := assert.New(t)
dq := NewDelayQueue()
delay := time.Second
added := dq.Offer(&testjob{})
if added {
t.Fatalf("DelayQueue should not add offered job without deadline")
}
deadline := time.Now().Add(delay)
added = dq.Offer(&testjob{deadline: &deadline})
if !added {
t.Fatalf("DelayQueue should add offered job with deadline")
}
before := time.Now()
x := dq.Pop()
now := time.Now()
waitPeriod := now.Sub(before)
if waitPeriod+tolerance < delay {
t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
}
assert.NotNil(x)
assert.Equal(x.(*testjob).deadline, &deadline)
}
func TestDQ_ordered_add_pop(t *testing.T) {
t.Parallel()
dq := NewDelayQueue()
dq.Add(&testjob{d: 2 * time.Second})
dq.Add(&testjob{d: 1 * time.Second})
dq.Add(&testjob{d: 3 * time.Second})
var finished [3]*testjob
before := time.Now()
idx := int32(-1)
ch := make(chan bool, 3)
//TODO: replace with `for range finished` once Go 1.3 support is dropped
for n := 0; n < len(finished); n++ {
go func() {
var ok bool
x := dq.Pop()
i := atomic.AddInt32(&idx, 1)
if finished[i], ok = x.(*testjob); !ok {
t.Fatalf("expected a *testjob, not %v", x)
}
finished[i].t = time.Now()
ch <- true
}()
}
<-ch
<-ch
<-ch
after := time.Now()
totalDelay := after.Sub(before)
if totalDelay+tolerance < (3 * time.Second) {
t.Fatalf("totalDelay < 3s: %v", totalDelay)
}
for i, v := range finished {
if v == nil {
t.Fatalf("task %d was nil", i)
}
expected := time.Duration(i+1) * time.Second
if v.d != expected {
t.Fatalf("task %d had delay-priority %v, expected %v", i, v.d, expected)
}
actualDelay := v.t.Sub(before)
if actualDelay+tolerance < v.d {
t.Fatalf("task %d had actual-delay %v < expected delay %v", i, actualDelay, v.d)
}
}
}
func TestDQ_always_pop_earliest_deadline(t *testing.T) {
t.Parallel()
// add a testjob with delay of 2s
// spawn a func f1 that attempts to Pop() and wait for f1 to begin
// add a testjob with a delay of 1s
// check that the func f1 actually popped the 1s task (not the 2s task)
dq := NewDelayQueue()
dq.Add(&testjob{d: 2 * time.Second})
ch := make(chan *testjob)
started := make(chan bool)
go func() {
started <- true
x := dq.Pop()
job := x.(*testjob)
job.t = time.Now()
ch <- job
}()
<-started
time.Sleep(500 * time.Millisecond) // give plenty of time for Pop() to enter
expected := 1 * time.Second
dq.Add(&testjob{d: expected})
job := <-ch
if expected != job.d {
t.Fatalf("Expected delay-prority of %v got instead got %v", expected, job.d)
}
job = dq.Pop().(*testjob)
expected = 2 * time.Second
if expected != job.d {
t.Fatalf("Expected delay-prority of %v got instead got %v", expected, job.d)
}
}
func TestDQ_always_pop_earliest_deadline_multi(t *testing.T) {
t.Parallel()
dq := NewDelayQueue()
dq.Add(&testjob{d: 2 * time.Second})
ch := make(chan *testjob)
multi := 10
started := make(chan bool, multi)
go func() {
started <- true
for i := 0; i < multi; i++ {
x := dq.Pop()
job := x.(*testjob)
job.t = time.Now()
ch <- job
}
}()
<-started
time.Sleep(500 * time.Millisecond) // give plenty of time for Pop() to enter
expected := 1 * time.Second
for i := 0; i < multi; i++ {
dq.Add(&testjob{d: expected})
}
for i := 0; i < multi; i++ {
job := <-ch
if expected != job.d {
t.Fatalf("Expected delay-prority of %v got instead got %v", expected, job.d)
}
}
job := dq.Pop().(*testjob)
expected = 2 * time.Second
if expected != job.d {
t.Fatalf("Expected delay-prority of %v got instead got %v", expected, job.d)
}
}
func TestDQ_negative_delay(t *testing.T) {
t.Parallel()
dq := NewDelayQueue()
delay := -2 * time.Second
dq.Add(&testjob{d: delay})
before := time.Now()
x := dq.Pop()
now := time.Now()
waitPeriod := now.Sub(before)
if waitPeriod > tolerance {
t.Fatalf("delay too long: %v, expected something less than: %v", waitPeriod, tolerance)
}
if x == nil {
t.Fatalf("x is nil")
}
item := x.(*testjob)
if item.d != delay {
t.Fatalf("d != delay")
}
}
func TestDFIFO_sanity_check(t *testing.T) {
t.Parallel()
assert := assert.New(t)
df := NewDelayFIFO()
delay := 2 * time.Second
df.Add(&testjob{d: delay, uid: "a", instance: 1}, ReplaceExisting)
assert.True(df.ContainedIDs().Has("a"))
// re-add by ReplaceExisting
df.Add(&testjob{d: delay, uid: "a", instance: 2}, ReplaceExisting)
assert.True(df.ContainedIDs().Has("a"))
a, ok := df.Get("a")
assert.True(ok)
assert.Equal(a.(*testjob).instance, 2)
// re-add by KeepExisting
df.Add(&testjob{d: delay, uid: "a", instance: 3}, KeepExisting)
assert.True(df.ContainedIDs().Has("a"))
a, ok = df.Get("a")
assert.True(ok)
assert.Equal(a.(*testjob).instance, 2)
// pop last
before := time.Now()
x := df.Pop()
assert.Equal(a.(*testjob).instance, 2)
now := time.Now()
waitPeriod := now.Sub(before)
if waitPeriod+tolerance < delay {
t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
}
if x == nil {
t.Fatalf("x is nil")
}
item := x.(*testjob)
if item.d != delay {
t.Fatalf("d != delay")
}
}
func TestDFIFO_Offer(t *testing.T) {
t.Parallel()
assert := assert.New(t)
dq := NewDelayFIFO()
delay := time.Second
added := dq.Offer(&testjob{instance: 1}, ReplaceExisting)
if added {
t.Fatalf("DelayFIFO should not add offered job without deadline")
}
deadline := time.Now().Add(delay)
added = dq.Offer(&testjob{deadline: &deadline, instance: 2}, ReplaceExisting)
if !added {
t.Fatalf("DelayFIFO should add offered job with deadline")
}
before := time.Now()
x := dq.Pop()
now := time.Now()
waitPeriod := now.Sub(before)
if waitPeriod+tolerance < delay {
t.Fatalf("delay too short: %v, expected: %v", waitPeriod, delay)
}
assert.NotNil(x)
assert.Equal(x.(*testjob).instance, 2)
}
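
As a rough consumer-side illustration of the API exercised above, the sketch below defines a standalone Delayed item and pushes it through a DelayQueue. It is not part of this commit; the import path and the assumption that Add accepts any value implementing Delayed are both hedged, not verified here.

package main

import (
	"fmt"
	"time"

	// assumed import path for the queue package introduced in this commit
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
)

// delayedNote is a hypothetical item; GetDelay satisfies queue.Delayed.
type delayedNote struct {
	text  string
	delay time.Duration
}

func (n *delayedNote) GetDelay() time.Duration { return n.delay }

func main() {
	dq := queue.NewDelayQueue()
	dq.Add(&delayedNote{text: "fires second", delay: 2 * time.Second})
	dq.Add(&delayedNote{text: "fires first", delay: 1 * time.Second})

	// Pop blocks until the earliest deadline passes, then returns that item.
	for i := 0; i < 2; i++ {
		fmt.Println(dq.Pop().(*delayedNote).text)
	}
}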

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package queue provides several queue implementations, originally
// inspired by Kubernetes pkg/client/cache/fifo.
package queue

View File

@@ -0,0 +1,403 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"fmt"
"reflect"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
)
type entry struct {
value UniqueCopyable
event EventType
}
type deletedEntry struct {
*entry
expiration time.Time
}
func (e *entry) Value() UniqueCopyable {
return e.value
}
func (e *entry) Copy() Copyable {
if e == nil {
return nil
}
return &entry{e.value.Copy().(UniqueCopyable), e.event}
}
func (e *entry) Is(types EventType) bool {
return types&e.event != 0
}
func (e *deletedEntry) Copy() Copyable {
if e == nil {
return nil
}
return &deletedEntry{e.entry.Copy().(*entry), e.expiration}
}
// deliver a message
type pigeon func(msg Entry)
func dead(msg Entry) {
// intentionally blank
}
// HistoricalFIFO receives adds and updates from a Reflector, and puts them in a queue for
// FIFO order processing. If multiple adds/updates of a single item happen while
// an item is in the queue before it has been processed, it will only be
// processed once, and when it is processed, the most recent version will be
// processed. This can't be done with a channel.
type HistoricalFIFO struct {
lock sync.RWMutex
cond sync.Cond
items map[string]Entry // We depend on the property that items in the queue are in the set.
queue []string
carrier pigeon // may be dead, but never nil
gcc int
lingerTTL time.Duration
}
// panics if obj doesn't implement UniqueCopyable; otherwise returns the same, typecast object
func checkType(obj interface{}) UniqueCopyable {
if v, ok := obj.(UniqueCopyable); !ok {
panic(fmt.Sprintf("Illegal object type, expected UniqueCopyable: %T", obj))
} else {
return v
}
}
// Add inserts an item, and puts it in the queue. The item is only enqueued
// if it doesn't already exist in the set.
func (f *HistoricalFIFO) Add(v interface{}) error {
obj := checkType(v)
notifications := []Entry(nil)
defer func() {
for _, e := range notifications {
f.carrier(e)
}
}()
f.lock.Lock()
defer f.lock.Unlock()
id := obj.GetUID()
if entry, exists := f.items[id]; !exists {
f.queue = append(f.queue, id)
} else {
if entry.Is(DELETE_EVENT | POP_EVENT) {
f.queue = append(f.queue, id)
}
}
notifications = f.merge(id, obj)
f.cond.Broadcast()
return nil
}
// Update is the same as Add in this implementation.
func (f *HistoricalFIFO) Update(obj interface{}) error {
return f.Add(obj)
}
// Delete removes an item. It doesn't add it to the queue, because
// this implementation assumes the consumer only cares about the objects,
// not the order in which they were created/added.
func (f *HistoricalFIFO) Delete(v interface{}) error {
obj := checkType(v)
deleteEvent := (Entry)(nil)
defer func() {
f.carrier(deleteEvent)
}()
f.lock.Lock()
defer f.lock.Unlock()
id := obj.GetUID()
item, exists := f.items[id]
if exists && !item.Is(DELETE_EVENT) {
e := item.(*entry)
e.event = DELETE_EVENT
deleteEvent = &deletedEntry{e, time.Now().Add(f.lingerTTL)}
f.items[id] = deleteEvent
}
return nil
}
// List returns a list of all the items.
func (f *HistoricalFIFO) List() []interface{} {
f.lock.RLock()
defer f.lock.RUnlock()
// TODO(jdef): slightly overallocates b/c of deleted items
list := make([]interface{}, 0, len(f.queue))
for _, entry := range f.items {
if entry.Is(DELETE_EVENT | POP_EVENT) {
continue
}
list = append(list, entry.Value().Copy())
}
return list
}
// ListKeys returns a list of the keys of all the items.
func (f *HistoricalFIFO) ListKeys() []string {
f.lock.RLock()
defer f.lock.RUnlock()
// TODO(jdef): slightly overallocates b/c of deleted items
list := make([]string, 0, len(f.queue))
for key, entry := range f.items {
if entry.Is(DELETE_EVENT | POP_EVENT) {
continue
}
list = append(list, key)
}
return list
}
// ContainedIDs returns a util.StringSet containing all IDs of the stored items.
// This is a snapshot of a moment in time, and one should keep in mind that
// other go routines can add or remove items after you call this.
func (c *HistoricalFIFO) ContainedIDs() util.StringSet {
c.lock.RLock()
defer c.lock.RUnlock()
set := util.StringSet{}
for id, entry := range c.items {
if entry.Is(DELETE_EVENT | POP_EVENT) {
continue
}
set.Insert(id)
}
return set
}
// Get returns the requested item, or sets exists=false.
func (f *HistoricalFIFO) Get(v interface{}) (interface{}, bool, error) {
obj := checkType(v)
return f.GetByKey(obj.GetUID())
}
// GetByKey returns the requested item, or sets exists=false.
func (f *HistoricalFIFO) GetByKey(id string) (interface{}, bool, error) {
f.lock.RLock()
defer f.lock.RUnlock()
entry, exists := f.items[id]
if exists && !entry.Is(DELETE_EVENT|POP_EVENT) {
return entry.Value().Copy(), true, nil
}
return nil, false, nil
}
// Poll returns true if an entry exists for the given id and matches the event mask.
func (f *HistoricalFIFO) Poll(id string, t EventType) bool {
f.lock.RLock()
defer f.lock.RUnlock()
entry, exists := f.items[id]
return exists && entry.Is(t)
}
// Await attempts to Pop within the given timeout; upon success the non-nil item is returned, otherwise nil
func (q *HistoricalFIFO) Await(timeout time.Duration) interface{} {
cancel := make(chan struct{})
ch := make(chan interface{}, 1)
go func() { ch <- q.pop(cancel) }()
select {
case <-time.After(timeout):
close(cancel)
return <-ch
case x := <-ch:
return x
}
}
func (f *HistoricalFIFO) Pop() interface{} {
return f.pop(nil)
}
func (f *HistoricalFIFO) pop(cancel chan struct{}) interface{} {
popEvent := (Entry)(nil)
defer func() {
f.carrier(popEvent)
}()
f.lock.Lock()
defer f.lock.Unlock()
for {
for len(f.queue) == 0 {
signal := make(chan struct{})
go func() {
defer close(signal)
f.cond.Wait()
}()
select {
case <-cancel:
// we may not have the lock yet, so
// broadcast to abort Wait, then
// return after lock re-acquisition
f.cond.Broadcast()
<-signal
return nil
case <-signal:
// we have the lock, re-check
// the queue for data...
}
}
id := f.queue[0]
f.queue = f.queue[1:]
item, ok := f.items[id]
if !ok || item.Is(DELETE_EVENT|POP_EVENT) {
// Item may have been deleted subsequently.
continue
}
value := item.Value()
popEvent = &entry{value, POP_EVENT}
f.items[id] = popEvent
return value.Copy()
}
}
func (f *HistoricalFIFO) Replace(objs []interface{}) error {
notifications := make([]Entry, 0, len(objs))
defer func() {
for _, e := range notifications {
f.carrier(e)
}
}()
idToObj := make(map[string]interface{})
for _, v := range objs {
obj := checkType(v)
idToObj[obj.GetUID()] = v
}
f.lock.Lock()
defer f.lock.Unlock()
f.queue = f.queue[:0]
now := time.Now()
for id, v := range f.items {
if _, exists := idToObj[id]; !exists && !v.Is(DELETE_EVENT) {
// a non-deleted entry in the items list that doesn't show up in the
// new list: mark it as deleted
ent := v.(*entry)
ent.event = DELETE_EVENT
e := &deletedEntry{ent, now.Add(f.lingerTTL)}
f.items[id] = e
notifications = append(notifications, e)
}
}
for id, v := range idToObj {
obj := checkType(v)
f.queue = append(f.queue, id)
n := f.merge(id, obj)
notifications = append(notifications, n...)
}
if len(f.queue) > 0 {
f.cond.Broadcast()
}
return nil
}
// garbage collect DELETEd items whose TTL has expired; the IDs of such items are removed
// from the queue. This impl assumes that caller has acquired state lock.
func (f *HistoricalFIFO) gc() {
now := time.Now()
deleted := make(map[string]struct{})
for id, v := range f.items {
if v.Is(DELETE_EVENT) {
ent := v.(*deletedEntry)
if ent.expiration.Before(now) {
delete(f.items, id)
deleted[id] = struct{}{}
}
}
}
// remove deleted items from the queue, will likely (slightly) overallocate here
queue := make([]string, 0, len(f.queue))
for _, id := range f.queue {
if _, exists := deleted[id]; !exists {
queue = append(queue, id)
}
}
f.queue = queue
}
// Assumes that the caller has acquired the state lock.
func (f *HistoricalFIFO) merge(id string, obj UniqueCopyable) (notifications []Entry) {
item, exists := f.items[id]
now := time.Now()
if !exists {
e := &entry{obj.Copy().(UniqueCopyable), ADD_EVENT}
f.items[id] = e
notifications = append(notifications, e)
} else {
if !item.Is(DELETE_EVENT) && item.Value().GetUID() != obj.GetUID() {
// hidden DELETE!
// (1) append a DELETE
// (2) append an ADD
// .. and notify listeners in that order
ent := item.(*entry)
ent.event = DELETE_EVENT
e1 := &deletedEntry{ent, now.Add(f.lingerTTL)}
e2 := &entry{obj.Copy().(UniqueCopyable), ADD_EVENT}
f.items[id] = e2
notifications = append(notifications, e1, e2)
} else if !reflect.DeepEqual(obj, item.Value()) {
//TODO(jdef): it would be nice if we could rely on resource versions
//instead of doing a DeepEqual. Maybe someday we'll be able to.
e := &entry{obj.Copy().(UniqueCopyable), UPDATE_EVENT}
f.items[id] = e
notifications = append(notifications, e)
}
}
// check for garbage collection
f.gcc++
if f.gcc%256 == 0 { //TODO(jdef): extract constant
f.gcc = 0
f.gc()
}
return
}
// NewHistorical returns a Store which can be used to queue up items to
// process. If a non-nil channel is provided, then modifications to the
// FIFO are delivered on it as Entry events.
func NewHistorical(ch chan<- Entry) FIFO {
carrier := dead
if ch != nil {
carrier = func(msg Entry) {
if msg != nil {
ch <- msg.Copy().(Entry)
}
}
}
f := &HistoricalFIFO{
items: map[string]Entry{},
queue: []string{},
carrier: carrier,
lingerTTL: 5 * time.Minute, // TODO(jdef): extract constant
}
f.cond.L = &f.lock
return f
}
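
The event channel accepted by NewHistorical is not exercised by the tests in the next file (they pass nil), so here is a minimal in-package sketch of how a consumer could observe ADD/UPDATE/POP notifications. It is illustrative only; the note type and the example function are hypothetical and would live in a _test.go file.

package queue

import "fmt"

// note is a hypothetical UniqueCopyable used only for this illustration.
type note struct {
	id   string
	body string
}

func (n *note) Copy() Copyable {
	c := *n
	return &c
}

func (n *note) GetUID() string { return n.id }

func ExampleNewHistorical_events() {
	events := make(chan Entry, 16) // buffered so the carrier never blocks here
	f := NewHistorical(events)

	f.Add(&note{id: "a", body: "v1"})    // emits an ADD_EVENT
	f.Update(&note{id: "a", body: "v2"}) // emits an UPDATE_EVENT (value changed)
	f.Pop()                              // emits a POP_EVENT

	close(events) // illustration only; a long-lived producer keeps the channel open
	for e := range events {
		fmt.Println(e.Value().GetUID(), e.Is(ADD_EVENT), e.Is(UPDATE_EVENT), e.Is(POP_EVENT))
	}
}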

View File

@@ -0,0 +1,191 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"fmt"
"testing"
"time"
)
type _int int
type _uint uint
func (i _int) Copy() Copyable {
return i
}
func (i _int) GetUID() string {
return fmt.Sprintf("INT%d", int(i))
}
func (i _uint) Copy() Copyable {
return i
}
func (i _uint) GetUID() string {
return fmt.Sprintf("UINT%d", uint64(i))
}
type testObj struct {
id string
value int
}
func (i *testObj) Copy() Copyable {
if i == nil {
return nil
} else {
return &testObj{i.id, i.value}
}
}
func (i *testObj) GetUID() string {
return i.id
}
func TestFIFO_basic(t *testing.T) {
f := NewHistorical(nil)
const amount = 500
go func() {
for i := 0; i < amount; i++ {
f.Add(_int(i + 1))
}
}()
go func() {
for u := uint(0); u < amount; u++ {
f.Add(_uint(u + 1))
}
}()
lastInt := _int(0)
lastUint := _uint(0)
for i := 0; i < amount*2; i++ {
switch obj := f.Pop().(type) {
case _int:
if obj <= lastInt {
t.Errorf("got %v (int) out of order, last was %v", obj, lastInt)
}
lastInt = obj
case _uint:
if obj <= lastUint {
t.Errorf("got %v (uint) out of order, last was %v", obj, lastUint)
} else {
lastUint = obj
}
default:
t.Fatalf("unexpected type %#v", obj)
}
}
}
func TestFIFO_addUpdate(t *testing.T) {
f := NewHistorical(nil)
f.Add(&testObj{"foo", 10})
f.Update(&testObj{"foo", 15})
got := make(chan *testObj, 2)
go func() {
for {
got <- f.Pop().(*testObj)
}
}()
first := <-got
if e, a := 15, first.value; e != a {
t.Errorf("Didn't get updated value (%v), got %v", e, a)
}
select {
case unexpected := <-got:
t.Errorf("Got second value %v", unexpected)
case <-time.After(50 * time.Millisecond):
}
_, exists, _ := f.GetByKey("foo")
if exists {
t.Errorf("item did not get removed")
}
}
func TestFIFO_addReplace(t *testing.T) {
f := NewHistorical(nil)
f.Add(&testObj{"foo", 10})
f.Replace([]interface{}{&testObj{"foo", 15}})
got := make(chan *testObj, 2)
go func() {
for {
got <- f.Pop().(*testObj)
}
}()
first := <-got
if e, a := 15, first.value; e != a {
t.Errorf("Didn't get updated value (%v), got %v", e, a)
}
select {
case unexpected := <-got:
t.Errorf("Got second value %v", unexpected)
case <-time.After(50 * time.Millisecond):
}
_, exists, _ := f.GetByKey("foo")
if exists {
t.Errorf("item did not get removed")
}
}
func TestFIFO_detectLineJumpers(t *testing.T) {
f := NewHistorical(nil)
f.Add(&testObj{"foo", 10})
f.Add(&testObj{"bar", 1})
f.Add(&testObj{"foo", 11})
f.Add(&testObj{"foo", 13})
f.Add(&testObj{"zab", 30})
err := error(nil)
done := make(chan struct{})
go func() {
defer close(done)
if e, a := 13, f.Pop().(*testObj).value; a != e {
err = fmt.Errorf("expected %d, got %d", e, a)
return
}
f.Add(&testObj{"foo", 14}) // ensure foo doesn't jump back in line
if e, a := 1, f.Pop().(*testObj).value; a != e {
err = fmt.Errorf("expected %d, got %d", e, a)
return
}
if e, a := 30, f.Pop().(*testObj).value; a != e {
err = fmt.Errorf("expected %d, got %d", e, a)
return
}
if e, a := 14, f.Pop().(*testObj).value; a != e {
err = fmt.Errorf("expected %d, got %d", e, a)
return
}
}()
select {
case <-done:
if err != nil {
t.Fatal(err)
}
case <-time.After(1 * time.Second):
t.Fatal("Deadlocked unit test")
}
}

View File

@@ -0,0 +1,103 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
)
type EventType int
const (
ADD_EVENT EventType = 1 << iota
UPDATE_EVENT
DELETE_EVENT
POP_EVENT
)
type Entry interface {
Copyable
Value() UniqueCopyable
// types is a logically OR'd combination of EventType, e.g. ADD_EVENT|UPDATE_EVENT
Is(types EventType) bool
}
type Copyable interface {
// return an independent copy (deep clone) of the current object
Copy() Copyable
}
type UniqueID interface {
GetUID() string
}
type UniqueCopyable interface {
Copyable
UniqueID
}
type FIFO interface {
cache.Store
// Pop waits until an item is ready and returns it. If multiple items are
// ready, they are returned in the order in which they were added/updated.
// The item is removed from the queue (and the store) before it is returned,
// so if you don't successfully process it, you need to add it back with Add().
Pop() interface{}
// Await attempts to Pop within the given interval; upon success the non-nil
// item is returned, otherwise nil
Await(timeout time.Duration) interface{}
// Is there an entry for the id that matches the event mask?
Poll(id string, types EventType) bool
}
type Delayed interface {
// return the remaining delay; a non-positive value indicates no delay
GetDelay() time.Duration
}
type Deadlined interface {
// when ok, returns the time when this object should be activated/executed/evaluated
Deadline() (deadline time.Time, ok bool)
}
// No objects are ever expected to be sent over this channel. References to BreakChan
// instances may be nil (always blocking). Signalling over this channel is performed by
// closing the channel. As such there can only ever be a single signal sent over the
// lifetime of the channel.
type BreakChan <-chan struct{}
// an optional interface to be implemented by Delayed objects; returning a nil
// channel from Breaker() results in waiting the full delay duration
type Breakout interface {
// return a channel that signals early departure from a blocking delay
Breaker() BreakChan
}
type UniqueDelayed interface {
UniqueID
Delayed
}
type UniqueDeadlined interface {
UniqueID
Deadlined
}
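
The Breakout/BreakChan contract above (signal by closing the channel, at most once) is not demonstrated elsewhere in this section, so the following in-package sketch shows one way a Delayed item could expose a breaker that cuts its wait short. The cancellableDelay type is hypothetical, and whether the delay-queue implementation honors the breaker for a waiting Pop is an assumption here, not something shown by this file.

package queue

import "time"

// cancellableDelay is a hypothetical Delayed item whose wait can be aborted early.
type cancellableDelay struct {
	delay  time.Duration
	cancel chan struct{} // closed (at most once) to signal early departure
}

func (c *cancellableDelay) GetDelay() time.Duration { return c.delay }

// Breaker satisfies Breakout; returning a nil channel would mean
// "always wait out the full delay".
func (c *cancellableDelay) Breaker() BreakChan { return BreakChan(c.cancel) }

// Usage sketch: a goroutine blocked waiting on this item's delay would be
// released as soon as close(item.cancel) is called.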

View File

@@ -0,0 +1,70 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
// Decide whether a pre-existing deadline for an item in a delay-queue should be
// updated if an attempt is made to offer/add a new deadline for said item. Whether
// the deadline changes or not has zero impact on the data blob associated with the
// entry in the queue.
type DeadlinePolicy int
const (
PreferLatest DeadlinePolicy = iota
PreferEarliest
)
// Decide whether a pre-existing data blob in a delay-queue should be replaced if an
// attempt is made to add/offer a new data blob in its place. Whether the data is
// replaced has no bearing on the deadline (priority) of the item in the queue.
type ReplacementPolicy int
const (
KeepExisting ReplacementPolicy = iota
ReplaceExisting
)
func (rp ReplacementPolicy) replacementValue(original, replacement interface{}) (result interface{}) {
switch rp {
case KeepExisting:
result = original
case ReplaceExisting:
fallthrough
default:
result = replacement
}
return
}
func (dp DeadlinePolicy) nextDeadline(a, b Priority) (result Priority) {
switch dp {
case PreferEarliest:
if a.ts.Before(b.ts) {
result = a
} else {
result = b
}
case PreferLatest:
fallthrough
default:
if a.ts.After(b.ts) {
result = a
} else {
result = b
}
}
return
}
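
For a concrete feel of how the two policies resolve a collision, here is a small in-package sketch (illustrative only; illustrateMerge is not part of this commit) that merges an existing entry's deadline and payload with a newly offered one.

package queue

import (
	"fmt"
	"time"
)

// illustrateMerge shows which deadline and which payload would win when an
// existing entry (2s out, "old payload") collides with a new offer (1s out,
// "new payload") under the given policies.
func illustrateMerge(dp DeadlinePolicy, rp ReplacementPolicy) {
	now := time.Now()
	existing := Priority{ts: now.Add(2 * time.Second)}
	offered := Priority{ts: now.Add(1 * time.Second)}

	deadline := dp.nextDeadline(existing, offered)
	payload := rp.replacementValue("old payload", "new payload")

	fmt.Printf("deadline %v from now, payload %q\n", deadline.ts.Sub(now), payload)
}

// illustrateMerge(PreferEarliest, ReplaceExisting) reports the 1s deadline and
// "new payload"; illustrateMerge(PreferLatest, KeepExisting) keeps both originals.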

View File

@@ -0,0 +1,56 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"time"
)
type Priority struct {
ts time.Time // timestamp
notify BreakChan // notification channel
}
func (p Priority) Equal(other Priority) bool {
return p.ts.Equal(other.ts) && p.notify == other.notify
}
func extractFromDelayed(d Delayed) Priority {
deadline := time.Now().Add(d.GetDelay())
breaker := BreakChan(nil)
if breakout, good := d.(Breakout); good {
breaker = breakout.Breaker()
}
return Priority{
ts: deadline,
notify: breaker,
}
}
func extractFromDeadlined(d Deadlined) (Priority, bool) {
if ts, ok := d.Deadline(); ok {
breaker := BreakChan(nil)
if breakout, good := d.(Breakout); good {
breaker = breakout.Breaker()
}
return Priority{
ts: ts,
notify: breaker,
}, true
}
return Priority{}, false
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Some file descriptor manipulation funcs (Unix-Only), inspired by
// https://github.com/skarnet/execline/blob/master/src/execline/redirfd.c
package redirfd

View File

@@ -0,0 +1,41 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package redirfd
import (
"fmt"
"strconv"
)
// FileDescriptor mirrors unix-specific indexes for cross-platform use
type FileDescriptor int
const (
InvalidFD FileDescriptor = -1
Stdin FileDescriptor = 0
Stdout FileDescriptor = 1
Stderr FileDescriptor = 2
)
// ParseFileDescriptor parses a string formatted file descriptor
func ParseFileDescriptor(fdstr string) (FileDescriptor, error) {
fdint, err := strconv.Atoi(fdstr)
if err != nil {
return InvalidFD, fmt.Errorf("file descriptor must be an integer: %q", fdstr)
}
return FileDescriptor(fdint), nil
}

View File

@@ -0,0 +1,54 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package redirfd
import (
"testing"
. "github.com/onsi/gomega"
)
func TestParseFileDescriptor(t *testing.T) {
RegisterTestingT(t)
valid := map[string]FileDescriptor{
"-1": InvalidFD,
"0": Stdin,
"1": Stdout,
"2": Stderr,
"3": FileDescriptor(3),
}
for input, expected := range valid {
fd, err := ParseFileDescriptor(input)
Expect(err).ToNot(HaveOccurred(), "Input: '%s'", input)
Expect(fd).To(Equal(expected), "Input: '%s'", input)
}
invalid := []string{
"a",
" 1",
"blue",
"stderr",
"STDERR",
}
for _, input := range invalid {
_, err := ParseFileDescriptor(input)
Expect(err).To(HaveOccurred(), "Input: '%s'", input)
}
}

View File

@@ -0,0 +1,208 @@
// +build !windows
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package redirfd
import (
"fmt"
"os"
"syscall"
)
type RedirectMode int
const (
Read RedirectMode = iota // open file for reading
Write // open file for writing, truncating if it exists
Update // open file for read & write
Append // open file for append, create if it does not exist
AppendExisting // open file for append, do not create if it does not already exist
WriteNew // open file for writing, creating it, failing if it already exists
)
// see https://github.com/skarnet/execline/blob/master/src/execline/redirfd.c
func (mode RedirectMode) Redirect(nonblock, changemode bool, fd FileDescriptor, name string) (*os.File, error) {
flags := 0
what := -1
switch mode {
case Read:
what = syscall.O_RDONLY
flags &= ^(syscall.O_APPEND | syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
case Write:
what = syscall.O_WRONLY
flags |= syscall.O_CREAT | syscall.O_TRUNC
flags &= ^(syscall.O_APPEND | syscall.O_EXCL)
case Update:
what = syscall.O_RDWR
flags &= ^(syscall.O_APPEND | syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
case Append:
what = syscall.O_WRONLY
flags |= syscall.O_CREAT | syscall.O_APPEND
flags &= ^(syscall.O_TRUNC | syscall.O_EXCL)
case AppendExisting:
what = syscall.O_WRONLY
flags |= syscall.O_APPEND
flags &= ^(syscall.O_CREAT | syscall.O_TRUNC | syscall.O_EXCL)
case WriteNew:
what = syscall.O_WRONLY
flags |= syscall.O_CREAT | syscall.O_EXCL
flags &= ^(syscall.O_APPEND | syscall.O_TRUNC)
default:
return nil, fmt.Errorf("unexpected mode %d", mode)
}
if nonblock {
flags |= syscall.O_NONBLOCK
}
flags |= what
fd2, e := open(name, flags, 0666)
if (what == syscall.O_WRONLY) && (e == syscall.ENXIO) {
// The write-only open failed with ENXIO (e.g. a FIFO with no reader yet): open the
// file read-only and non-blocking first, retry the original open, then drop the temporary read fd.
fdr, e2 := open(name, syscall.O_RDONLY|syscall.O_NONBLOCK, 0)
if e2 != nil {
return nil, &os.PathError{"open_read", name, e2}
}
fd2, e = open(name, flags, 0666)
fd_close(fdr)
}
if e != nil {
return nil, &os.PathError{"open", name, e}
}
if e = fd_move(fd, fd2); e != nil {
return nil, &os.PathError{"fd_move", name, e}
}
if changemode {
if nonblock {
e = ndelay_off(fd)
} else {
e = ndelay_on(fd)
}
if e != nil {
return nil, &os.PathError{"ndelay", name, e}
}
}
return os.NewFile(uintptr(fd2), name), nil
}
// proxy to return a FileDescriptor
func open(path string, openmode int, perm uint32) (FileDescriptor, error) {
fdint, err := syscall.Open(path, openmode, perm)
return FileDescriptor(fdint), err
}
// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/fd_move.c
func fd_move(to, from FileDescriptor) (err error) {
if to == from {
return
}
for {
_, _, e1 := syscall.RawSyscall(syscall.SYS_DUP2, uintptr(from), uintptr(to), 0)
if e1 != syscall.EINTR {
if e1 != 0 {
err = e1
}
break
}
}
if err != nil {
err = fd_close(from)
}
return
/*
do
r = dup2(from, to) ;
while ((r == -1) && (errno == EINTR)) ;
return (r == -1) ? -1 : fd_close(from) ;
*/
}
// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/fd_close.c
func fd_close(fd FileDescriptor) (err error) {
i := 0
var e error
for {
if e = syscall.Close(int(fd)); e == nil {
return nil
}
i++
if e != syscall.EINTR {
break
}
}
if e == syscall.EBADF && i > 1 {
return nil
}
return e
}
/*
int fd_close (int fd)
{
register unsigned int i = 0 ;
doit:
if (!close(fd)) return 0 ;
i++ ;
if (errno == EINTR) goto doit ;
return ((errno == EBADF) && (i > 1)) ? 0 : -1 ;
}
*/
// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/ndelay_on.c
func ndelay_on(fd FileDescriptor) error {
// 32-bit will likely break because it needs SYS_FCNTL64
got, _, e := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_GETFL), 0)
if e != 0 {
return e
}
_, _, e = syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_SETFL), uintptr(got|syscall.O_NONBLOCK))
if e != 0 {
return e
}
return nil
}
/*
int ndelay_on (int fd)
{
register int got = fcntl(fd, F_GETFL) ;
return (got == -1) ? -1 : fcntl(fd, F_SETFL, got | O_NONBLOCK) ;
}
*/
// see https://github.com/skarnet/skalibs/blob/master/src/libstddjb/ndelay_off.c
func ndelay_off(fd FileDescriptor) error {
// 32-bit will likely break because it needs SYS_FCNTL64
got, _, e := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_GETFL), 0)
if e != 0 {
return e
}
_, _, e = syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), uintptr(syscall.F_SETFL), uintptr(int(got) & ^syscall.O_NONBLOCK))
if e != 0 {
return e
}
return nil
}
/*
int ndelay_off (int fd)
{
register int got = fcntl(fd, F_GETFL) ;
return (got == -1) ? -1 : fcntl(fd, F_SETFL, got & ^O_NONBLOCK) ;
}
*/
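
As a usage sketch for the Unix implementation above (not part of this commit; the import path and log file path are assumptions): append the current process's stderr to a log file using the Append mode.

package main

import (
	"fmt"
	"os"

	// assumed import path for the redirfd package introduced in this commit
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/redirfd"
)

func main() {
	// redirect fd 2 so that writes to stderr append to the named file,
	// creating it if necessary; blocking mode, no fcntl flag changes afterwards.
	f, err := redirfd.Append.Redirect(false, false, redirfd.Stderr, "/tmp/executor-stderr.log")
	if err != nil {
		fmt.Println("redirect failed:", err)
		os.Exit(1)
	}
	defer f.Close()

	fmt.Fprintln(os.Stderr, "this line lands in the log file")
}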

View File

@@ -0,0 +1,39 @@
// +build windows
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package redirfd
import (
"fmt"
"os"
)
type RedirectMode int
const (
Read RedirectMode = iota // open file for reading
Write // open file for writing, truncating if it exists
Update // open file for read & write
Append // open file for append, create if it does not exist
AppendExisting // open file for append, do not create if it does not already exist
WriteNew // open file for writing, creating it, failing if it already exists
)
func (mode RedirectMode) Redirect(nonblock, changemode bool, fd FileDescriptor, name string) (*os.File, error) {
return nil, fmt.Errorf("Redirect(%s, %s, %d, \"%s\") not supported on windows", nonblock, changemode, fd, name)
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package runtime provides utilities for semaphores (chan struct{}),
// a simple Latch implementation, and metrics for reporting handled panics.
package runtime

View File

@@ -0,0 +1,35 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"sync/atomic"
)
type Latch struct {
int32
}
// Acquire returns true if this latch was successfully acquired. It is concurrency safe and
// returns true only upon the first invocation; all subsequent invocations return false,
// as does any invocation on a nil receiver.
func (self *Latch) Acquire() bool {
if self == nil {
return false
}
return atomic.CompareAndSwapInt32(&self.int32, 0, 1)
}

View File

@@ -0,0 +1,61 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"sync"
"sync/atomic"
"testing"
"time"
)
func Test_LatchAcquireBasic(t *testing.T) {
var x Latch
if !x.Acquire() {
t.Fatalf("expected first acquire to succeed")
}
if x.Acquire() {
t.Fatalf("expected second acquire to fail")
}
if x.Acquire() {
t.Fatalf("expected third acquire to fail")
}
}
func Test_LatchAcquireConcurrent(t *testing.T) {
var x Latch
const NUM = 10
ch := make(chan struct{})
var success int32
var wg sync.WaitGroup
wg.Add(NUM)
for i := 0; i < NUM; i++ {
go func() {
defer wg.Done()
<-ch
if x.Acquire() {
atomic.AddInt32(&success, 1)
}
}()
}
time.Sleep(200 * time.Millisecond)
close(ch)
wg.Wait()
if success != 1 {
t.Fatalf("expected single acquire to succeed instead of %d", success)
}
}

View File

@@ -0,0 +1,47 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"sync"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/prometheus/client_golang/prometheus"
)
const (
runtimeSubsystem = "runtime"
)
var (
panicCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Subsystem: runtimeSubsystem,
Name: "panics",
Help: "Counter of panics handled by the internal crash handler.",
},
)
)
var registerMetrics sync.Once
func Register() {
registerMetrics.Do(func() {
prometheus.MustRegister(panicCounter)
util.PanicHandlers = append(util.PanicHandlers, func(interface{}) { panicCounter.Inc() })
})
}
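
A brief sketch of how this registration is meant to be used (import path assumed): call Register once at startup so that panics recovered by util.HandleCrash are counted and exported via Prometheus.

package main

import (
	// assumed import path for the runtime package introduced in this commit
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
)

func main() {
	// Register is idempotent (guarded by sync.Once); call it before starting
	// any goroutines that rely on util.HandleCrash.
	runtime.Register()
	// ... start scheduler/executor components here ...
}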

View File

@@ -0,0 +1,122 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"os"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
)
type Signal <-chan struct{}
// return a func that will close the signal chan.
// multiple invocations of the returned func will not generate a panic.
// two funcs from separate invocations of Closer() (on the same sig chan) will cause a panic if both are invoked.
// for example:
// // good
// x := runtime.After(func() { ... })
// f := x.Closer()
// f()
// f()
//
// // bad
// x := runtime.After(func() { ... })
// f := x.Closer()
// g := x.Closer()
// f()
// g() // this will panic
func Closer(sig chan<- struct{}) func() {
var once sync.Once
return func() {
once.Do(func() { close(sig) })
}
}
// upon receiving signal sig invoke function f and immediately return a signal
// that indicates f's completion. used to chain handler funcs, for example:
// On(job.Done(), response.Send).Then(wg.Done)
func (sig Signal) Then(f func()) Signal {
if sig == nil {
return nil
}
return On(sig, f)
}
// execute a callback function after the specified signal chan closes.
// immediately returns a signal that indicates f's completion.
func On(sig <-chan struct{}, f func()) Signal {
if sig == nil {
return nil
}
return After(func() {
<-sig
if f != nil {
f()
}
})
}
func OnOSSignal(sig <-chan os.Signal, f func(os.Signal)) Signal {
if sig == nil {
return nil
}
return After(func() {
if s, ok := <-sig; ok && f != nil {
f(s)
}
})
}
// spawn a goroutine to execute a func, immediately returns a chan that closes
// upon completion of the func. returns a nil signal chan if the given func is nil.
func After(f func()) Signal {
ch := make(chan struct{})
go func() {
defer close(ch)
defer util.HandleCrash()
if f != nil {
f()
}
}()
return Signal(ch)
}
// periodically execute the given function, stopping once stopCh is closed.
// this func blocks until stopCh is closed; it is intended to be run as a goroutine.
func Until(f func(), period time.Duration, stopCh <-chan struct{}) {
if f == nil {
return
}
for {
select {
case <-stopCh:
return
default:
}
func() {
defer util.HandleCrash()
f()
}()
select {
case <-stopCh:
case <-time.After(period):
}
}
}
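
OnOSSignal is the one helper in this file that the test below does not exercise, so here is an illustrative sketch (import path assumed, cleanup step hypothetical) that traps SIGTERM and chains a cleanup function with Then.

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"

	// assumed import path for the runtime package introduced in this commit
	"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
)

func main() {
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGTERM)

	// when SIGTERM arrives, report it, then run a cleanup step;
	// the final signal closes once the cleanup has finished.
	done := runtime.OnOSSignal(sigCh, func(s os.Signal) {
		fmt.Println("received", s)
	}).Then(func() {
		fmt.Println("cleanup complete")
	})

	<-done
}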

View File

@@ -0,0 +1,64 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"testing"
"time"
)
func TestUntil(t *testing.T) {
ch := make(chan struct{})
close(ch)
Until(func() {
t.Fatal("should not have been invoked")
}, 0, ch)
//--
ch = make(chan struct{})
called := make(chan struct{})
After(func() {
Until(func() {
called <- struct{}{}
}, 0, ch)
}).Then(func() { close(called) })
<-called
close(ch)
<-called
//--
ch = make(chan struct{})
called = make(chan struct{})
running := make(chan struct{})
After(func() {
Until(func() {
close(running)
called <- struct{}{}
}, 2*time.Second, ch)
}).Then(func() { close(called) })
<-running
close(ch)
<-called // unblock the goroutine
now := time.Now()
<-called
if time.Since(now) > 1800*time.Millisecond {
t.Fatalf("Until should not have waited the full timeout period since we closed the stop chan")
}
}

View File

@@ -0,0 +1,109 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
import (
"io"
"time"
"code.google.com/p/gcfg"
)
const (
DefaultOfferTTL = 5 * time.Second // duration an offer is viable, prior to being expired
DefaultOfferLingerTTL = 120 * time.Second // duration an expired offer lingers in history
DefaultListenerDelay = 1 * time.Second // duration between offer listener notifications
DefaultUpdatesBacklog = 2048 // size of the pod updates channel
DefaultFrameworkIdRefreshInterval = 30 * time.Second // interval we update the frameworkId stored in etcd
DefaultInitialImplicitReconciliationDelay = 15 * time.Second // wait this amount of time after initial registration before attempting implicit reconciliation
DefaultExplicitReconciliationMaxBackoff = 2 * time.Minute // interval in between internal task status checks/updates
DefaultExplicitReconciliationAbortTimeout = 30 * time.Second // waiting period after attempting to cancel an ongoing reconciliation
DefaultInitialPodBackoff = 1 * time.Second
DefaultMaxPodBackoff = 60 * time.Second
DefaultHttpHandlerTimeout = 10 * time.Second
DefaultHttpBindInterval = 5 * time.Second
)
// Example scheduler configuration file:
//
// [scheduler]
// info-name = Kubernetes
// offer-ttl = 5s
// offer-linger-ttl = 2m
type ConfigWrapper struct {
Scheduler Config
}
type Config struct {
OfferTTL WrappedDuration `gcfg:"offer-ttl"`
OfferLingerTTL WrappedDuration `gcfg:"offer-linger-ttl"`
ListenerDelay WrappedDuration `gcfg:"listener-delay"`
UpdatesBacklog int `gcfg:"updates-backlog"`
FrameworkIdRefreshInterval WrappedDuration `gcfg:"framework-id-refresh-interval"`
InitialImplicitReconciliationDelay WrappedDuration `gcfg:"initial-implicit-reconciliation-delay"`
ExplicitReconciliationMaxBackoff WrappedDuration `gcfg:"explicit-reconciliantion-max-backoff"`
ExplicitReconciliationAbortTimeout WrappedDuration `gcfg:"explicit-reconciliantion-abort-timeout"`
InitialPodBackoff WrappedDuration `gcfg:"initial-pod-backoff"`
MaxPodBackoff WrappedDuration `gcfg:"max-pod-backoff"`
HttpHandlerTimeout WrappedDuration `gcfg:"http-handler-timeout"`
HttpBindInterval WrappedDuration `gcfg:"http-bind-interval"`
}
type WrappedDuration struct {
time.Duration
}
func (wd *WrappedDuration) UnmarshalText(data []byte) error {
d, err := time.ParseDuration(string(data))
if err == nil {
wd.Duration = d
}
return err
}
func (c *Config) SetDefaults() {
c.OfferTTL = WrappedDuration{DefaultOfferTTL}
c.OfferLingerTTL = WrappedDuration{DefaultOfferLingerTTL}
c.ListenerDelay = WrappedDuration{DefaultListenerDelay}
c.UpdatesBacklog = DefaultUpdatesBacklog
c.FrameworkIdRefreshInterval = WrappedDuration{DefaultFrameworkIdRefreshInterval}
c.InitialImplicitReconciliationDelay = WrappedDuration{DefaultInitialImplicitReconciliationDelay}
c.ExplicitReconciliationMaxBackoff = WrappedDuration{DefaultExplicitReconciliationMaxBackoff}
c.ExplicitReconciliationAbortTimeout = WrappedDuration{DefaultExplicitReconciliationAbortTimeout}
c.InitialPodBackoff = WrappedDuration{DefaultInitialPodBackoff}
c.MaxPodBackoff = WrappedDuration{DefaultMaxPodBackoff}
c.HttpHandlerTimeout = WrappedDuration{DefaultHttpHandlerTimeout}
c.HttpBindInterval = WrappedDuration{DefaultHttpBindInterval}
}
func CreateDefaultConfig() *Config {
c := &Config{}
c.SetDefaults()
return c
}
func (c *Config) Read(configReader io.Reader) error {
wrapper := &ConfigWrapper{Scheduler: *c}
if configReader != nil {
if err := gcfg.ReadInto(wrapper, configReader); err != nil {
return err
}
*c = wrapper.Scheduler
}
return nil
}
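
To tie the example configuration file in the comment above to the Read/SetDefaults API, here is a short sketch of loading scheduler tuning from disk; the config path and the package import path are assumptions.

package main

import (
	"fmt"
	"os"

	// assumed import path for the scheduler config package introduced in this commit
	schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
)

func main() {
	cfg := schedcfg.CreateDefaultConfig() // start from the documented defaults

	if f, err := os.Open("/etc/kubernetes-mesos/scheduler.conf"); err == nil {
		defer f.Close()
		if err := cfg.Read(f); err != nil {
			fmt.Println("invalid scheduler config:", err)
			os.Exit(1)
		}
	}

	fmt.Println("offer TTL:", cfg.OfferTTL.Duration)
}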

View File

@@ -0,0 +1,112 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
import (
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
func is_default(c *Config, t *testing.T) {
assert := assert.New(t)
assert.Equal(DefaultOfferTTL, c.OfferTTL.Duration)
assert.Equal(DefaultOfferLingerTTL, c.OfferLingerTTL.Duration)
assert.Equal(DefaultListenerDelay, c.ListenerDelay.Duration)
assert.Equal(DefaultUpdatesBacklog, c.UpdatesBacklog)
assert.Equal(DefaultFrameworkIdRefreshInterval, c.FrameworkIdRefreshInterval.Duration)
assert.Equal(DefaultInitialImplicitReconciliationDelay, c.InitialImplicitReconciliationDelay.Duration)
assert.Equal(DefaultExplicitReconciliationMaxBackoff, c.ExplicitReconciliationMaxBackoff.Duration)
assert.Equal(DefaultExplicitReconciliationAbortTimeout, c.ExplicitReconciliationAbortTimeout.Duration)
assert.Equal(DefaultInitialPodBackoff, c.InitialPodBackoff.Duration)
assert.Equal(DefaultMaxPodBackoff, c.MaxPodBackoff.Duration)
assert.Equal(DefaultHttpHandlerTimeout, c.HttpHandlerTimeout.Duration)
assert.Equal(DefaultHttpBindInterval, c.HttpBindInterval.Duration)
}
// Check that SetDefaults sets the default values
func TestConfig_SetDefaults(t *testing.T) {
c := &Config{}
c.SetDefaults()
is_default(c, t)
}
// Check that CreateDefaultConfig returns a default config
func TestConfig_CreateDefaultConfig(t *testing.T) {
c := CreateDefaultConfig()
is_default(c, t)
}
// Check that a config string can be parsed
func TestConfig_Read(t *testing.T) {
assert := assert.New(t)
c := CreateDefaultConfig()
reader := strings.NewReader(`
[scheduler]
offer-ttl=42s
offer-linger-ttl=42s
listener-delay=42s
updates-backlog=42
framework-id-refresh-interval=42s
initial-implicit-reconciliation-delay=42s
explicit-reconciliantion-max-backoff=42s
explicit-reconciliantion-abort-timeout=42s
initial-pod-backoff=42s
max-pod-backoff=42s
http-handler-timeout=42s
http-bind-interval=42s
`)
err := c.Read(reader)
if err != nil {
t.Fatal("Cannot parse scheduler config: " + err.Error())
}
assert.Equal(42*time.Second, c.OfferTTL.Duration)
assert.Equal(42*time.Second, c.OfferLingerTTL.Duration)
assert.Equal(42*time.Second, c.ListenerDelay.Duration)
assert.Equal(42, c.UpdatesBacklog)
assert.Equal(42*time.Second, c.FrameworkIdRefreshInterval.Duration)
assert.Equal(42*time.Second, c.InitialImplicitReconciliationDelay.Duration)
assert.Equal(42*time.Second, c.ExplicitReconciliationMaxBackoff.Duration)
assert.Equal(42*time.Second, c.ExplicitReconciliationAbortTimeout.Duration)
assert.Equal(42*time.Second, c.InitialPodBackoff.Duration)
assert.Equal(42*time.Second, c.MaxPodBackoff.Duration)
assert.Equal(42*time.Second, c.HttpHandlerTimeout.Duration)
assert.Equal(42*time.Second, c.HttpBindInterval.Duration)
}
// Check that an invalid config is rejected and none of the values are overwritten
func TestConfig_ReadError(t *testing.T) {
assert := assert.New(t)
c := CreateDefaultConfig()
reader := strings.NewReader(`
[scheduler]
offer-ttl = 42s
invalid-setting = 42s
`)
err := c.Read(reader)
if err == nil {
t.Fatal("Invalid scheduler config should lead to an error")
}
assert.NotEqual(42*time.Second, c.OfferTTL.Duration)
}

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package config provides mechanisms for low-level scheduler tuning.
package config

View File

@@ -0,0 +1,106 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package constraint
import (
"encoding/json"
"fmt"
)
type OperatorType int
const (
UniqueOperator OperatorType = iota
LikeOperator
ClusterOperator
GroupByOperator
UnlikeOperator
)
var (
labels = []string{
"UNIQUE",
"LIKE",
"CLUSTER",
"GROUP_BY",
"UNLIKE",
}
labelToType map[string]OperatorType
)
func init() {
labelToType = make(map[string]OperatorType)
for i, s := range labels {
labelToType[s] = OperatorType(i)
}
}
func (t OperatorType) String() string {
switch t {
case UniqueOperator, LikeOperator, ClusterOperator, GroupByOperator, UnlikeOperator:
return labels[int(t)]
default:
panic(fmt.Sprintf("unrecognized operator type: %d", int(t)))
}
}
func parseOperatorType(s string) (OperatorType, error) {
t, found := labelToType[s]
if !found {
return UniqueOperator, fmt.Errorf("unrecognized operator %q", s)
}
return t, nil
}
type Constraint struct {
Field string // required
Operator OperatorType // required
Value string // optional
}
func (c *Constraint) MarshalJSON() ([]byte, error) {
var a []string
if c != nil {
if c.Value != "" {
a = append(a, c.Field, c.Operator.String(), c.Value)
} else {
a = append(a, c.Field, c.Operator.String())
}
}
return json.Marshal(a)
}
func (c *Constraint) UnmarshalJSON(buf []byte) (err error) {
var a []string
if err = json.Unmarshal(buf, &a); err != nil {
return err
}
switch x := len(a); {
case x < 2:
err = fmt.Errorf("not enough arguments to form constraint")
case x > 3:
err = fmt.Errorf("too many arguments to form constraint")
case x == 3:
c.Value = a[2]
fallthrough
case x == 2:
c.Field = a[0]
c.Operator, err = parseOperatorType(a[1])
}
return err
}

View File

@@ -0,0 +1,79 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package constraint
import (
"encoding/json"
"testing"
)
func TestDeserialize(t *testing.T) {
shouldMatch := func(js string, field string, operator OperatorType, value string) (err error) {
constraint := Constraint{}
if err = json.Unmarshal(([]byte)(js), &constraint); err != nil {
return
}
if field != constraint.Field {
t.Fatalf("expected field %q instead of %q", field, constraint.Field)
}
if operator != constraint.Operator {
t.Fatalf("expected operator %v instead of %v", operator, constraint.Operator)
}
if value != constraint.Value {
t.Fatalf("expected value %q instead of %q", value, constraint.Value)
}
return
}
failOnError := func(err error) {
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
failOnError(shouldMatch(`["hostname","UNIQUE"]`, "hostname", UniqueOperator, ""))
failOnError(shouldMatch(`["rackid","GROUP_BY","1"]`, "rackid", GroupByOperator, "1"))
failOnError(shouldMatch(`["jdk","LIKE","7"]`, "jdk", LikeOperator, "7"))
failOnError(shouldMatch(`["jdk","UNLIKE","7"]`, "jdk", UnlikeOperator, "7"))
failOnError(shouldMatch(`["bob","CLUSTER","foo"]`, "bob", ClusterOperator, "foo"))
err := shouldMatch(`["bill","NOT_REALLY_AN_OPERATOR","pete"]`, "bill", ClusterOperator, "pete")
if err == nil {
t.Fatalf("expected unmarshalling error for invalid operator")
}
}
func TestSerialize(t *testing.T) {
shouldMatch := func(expected string, constraint *Constraint) error {
data, err := json.Marshal(constraint)
if err != nil {
return err
}
js := string(data)
if js != expected {
t.Fatalf("expected json %q instead of %q", expected, js)
}
return nil
}
failOnError := func(err error) {
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
failOnError(shouldMatch(`["hostname","UNIQUE"]`, &Constraint{"hostname", UniqueOperator, ""}))
failOnError(shouldMatch(`["rackid","GROUP_BY","1"]`, &Constraint{"rackid", GroupByOperator, "1"}))
failOnError(shouldMatch(`["jdk","LIKE","7"]`, &Constraint{"jdk", LikeOperator, "7"}))
failOnError(shouldMatch(`["jdk","UNLIKE","7"]`, &Constraint{"jdk", UnlikeOperator, "7"}))
failOnError(shouldMatch(`["bob","CLUSTER","foo"]`, &Constraint{"bob", ClusterOperator, "foo"}))
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package constraint exposes Marathon-like constraints for scheduling pods.
// Incomplete.
package constraint

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package scheduler implements the Kubernetes Mesos scheduler.
package scheduler

View File

@@ -0,0 +1,57 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
log "github.com/golang/glog"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
)
// A first-come-first-serve scheduler: acquires the first offer that can support the task
func FCFSScheduleFunc(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name)
var acceptedOffer offers.Perishable
err := r.Walk(func(p offers.Perishable) (bool, error) {
offer := p.Details()
if offer == nil {
return false, fmt.Errorf("nil offer while scheduling task %v", task.ID)
}
if task.AcceptOffer(offer) {
if p.Acquire() {
acceptedOffer = p
log.V(3).Infof("Pod %s accepted offer %v", podName, offer.Id.GetValue())
return true, nil // stop, we found an offer
}
}
return false, nil // continue
})
if acceptedOffer != nil {
if err != nil {
log.Warningf("problems walking the offer registry: %v, attempting to continue", err)
}
return acceptedOffer, nil
}
if err != nil {
log.V(2).Infof("failed to find a fit for pod: %s, err = %v", podName, err)
return nil, err
}
log.V(2).Infof("failed to find a fit for pod: %s", podName)
return nil, noSuitableOffersErr
}

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package ha encapsulates high-availability scheduler concerns.
package ha

View File

@@ -0,0 +1,73 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ha
import (
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election"
log "github.com/golang/glog"
)
type roleType int
const (
followerRole roleType = iota
masterRole
retiredRole
)
type candidateService struct {
sched *SchedulerProcess
newDriver DriverFactory
role roleType
valid ValidationFunc
}
type ValidationFunc func(desiredUid, currentUid string)
func NewCandidate(s *SchedulerProcess, f DriverFactory, v ValidationFunc) election.Service {
return &candidateService{
sched: s,
newDriver: f,
role: followerRole,
valid: v,
}
}
func (self *candidateService) Validate(desired, current election.Master) {
if self.valid != nil {
self.valid(string(desired), string(current))
}
}
func (self *candidateService) Start() {
if self.role == followerRole {
log.Info("elected as master")
self.role = masterRole
self.sched.Elect(self.newDriver)
}
}
func (self *candidateService) Stop() {
if self.role == masterRole {
log.Info("retiring from master")
self.role = retiredRole
// order is important here, watchers of a SchedulerProcess will
// check SchedulerProcess.Failover() once Done() is closed.
close(self.sched.failover)
self.sched.End()
}
}
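// Editor's note (illustrative, not part of the original commit): NewCandidate
// returns an election.Service; the surrounding leader-election machinery calls
// Start() when this scheduler wins the election and Stop() when it must retire,
// roughly like:
//
//	candidate := NewCandidate(schedProcess, driverFactory, nil)
//	// hand `candidate` to the election notifier, which drives Start()/Stop()
//
// schedProcess and driverFactory are assumed to be constructed by the caller.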

View File

@@ -0,0 +1,285 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ha
import (
"fmt"
"sync/atomic"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
bindings "github.com/mesos/mesos-go/scheduler"
)
type DriverFactory func() (bindings.SchedulerDriver, error)
type stageType int32
const (
initStage stageType = iota
standbyStage
masterStage
finStage
)
func (stage *stageType) transition(from, to stageType) bool {
return atomic.CompareAndSwapInt32((*int32)(stage), int32(from), int32(to))
}
func (s *stageType) transitionTo(to stageType, unless ...stageType) bool {
if len(unless) == 0 {
atomic.StoreInt32((*int32)(s), int32(to))
return true
}
for {
state := s.get()
for _, x := range unless {
if state == x {
return false
}
}
if s.transition(state, to) {
return true
}
}
}
func (stage *stageType) get() stageType {
return stageType(atomic.LoadInt32((*int32)(stage)))
}
// execute some action in the deferred context of the process, but only if we
// match the stage of the process at the time the action is executed.
func (stage stageType) Do(p *SchedulerProcess, a proc.Action) <-chan error {
errOnce := proc.NewErrorOnce(p.fin)
errOuter := p.Do(proc.Action(func() {
switch stage {
case standbyStage:
//await standby signal or death
select {
case <-p.standby:
case <-p.Done():
}
case masterStage:
//await elected signal or death
select {
case <-p.elected:
case <-p.Done():
}
case finStage:
errOnce.Reportf("scheduler process is dying, dropping action")
return
default:
}
errOnce.Report(stage.When(p, a))
}))
return errOnce.Send(errOuter).Err()
}
// execute some action only if we match the stage of the scheduler process
func (stage stageType) When(p *SchedulerProcess, a proc.Action) (err error) {
if stage != (&p.stage).get() {
err = fmt.Errorf("failed to execute deferred action, expected lifecycle stage %v instead of %v", stage, p.stage)
} else {
a()
}
return
}
type SchedulerProcess struct {
proc.Process
bindings.Scheduler
stage stageType
elected chan struct{} // upon close we've been elected
failover chan struct{} // closed indicates that we should failover upon End()
standby chan struct{}
fin chan struct{}
}
func New(sched bindings.Scheduler) *SchedulerProcess {
p := &SchedulerProcess{
Process: proc.New(),
Scheduler: sched,
stage: initStage,
elected: make(chan struct{}),
failover: make(chan struct{}),
standby: make(chan struct{}),
fin: make(chan struct{}),
}
runtime.On(p.Running(), p.begin)
return p
}
func (self *SchedulerProcess) begin() {
if (&self.stage).transition(initStage, standbyStage) {
close(self.standby)
log.Infoln("scheduler process entered standby stage")
} else {
log.Errorf("failed to transition from init to standby stage")
}
}
func (self *SchedulerProcess) End() <-chan struct{} {
if (&self.stage).transitionTo(finStage, finStage) {
defer close(self.fin)
log.Infoln("scheduler process entered fin stage")
}
return self.Process.End()
}
func (self *SchedulerProcess) Elect(newDriver DriverFactory) {
errOnce := proc.NewErrorOnce(self.fin)
proc.OnError(errOnce.Send(standbyStage.Do(self, proc.Action(func() {
if !(&self.stage).transition(standbyStage, masterStage) {
log.Errorf("failed to transition from standby to master stage, aborting")
self.End()
return
}
log.Infoln("scheduler process entered master stage")
drv, err := newDriver()
if err != nil {
log.Errorf("failed to fetch scheduler driver: %v", err)
self.End()
return
}
log.V(1).Infoln("starting driver...")
stat, err := drv.Start()
if stat == mesos.Status_DRIVER_RUNNING && err == nil {
log.Infoln("driver started successfully and is running")
close(self.elected)
go func() {
defer self.End()
_, err := drv.Join()
if err != nil {
log.Errorf("driver failed with error: %v", err)
}
errOnce.Report(err)
}()
return
}
defer self.End()
if err != nil {
log.Errorf("failed to start scheduler driver: %v", err)
} else {
log.Errorf("expected RUNNING status, not %v", stat)
}
}))).Err(), func(err error) {
defer self.End()
log.Errorf("failed to handle election event, aborting: %v", err)
}, self.fin)
}
func (self *SchedulerProcess) Terminal() <-chan struct{} {
return self.fin
}
func (self *SchedulerProcess) Elected() <-chan struct{} {
return self.elected
}
func (self *SchedulerProcess) Failover() <-chan struct{} {
return self.failover
}
type masterProcess struct {
*SchedulerProcess
doer proc.Doer
}
func (self *masterProcess) Done() <-chan struct{} {
return self.SchedulerProcess.Terminal()
}
func (self *masterProcess) Do(a proc.Action) <-chan error {
return self.doer.Do(a)
}
// returns a Process instance that will only execute a proc.Action if the scheduler is the elected master
func (self *SchedulerProcess) Master() proc.Process {
return &masterProcess{
SchedulerProcess: self,
doer: proc.DoWith(self, proc.DoerFunc(func(a proc.Action) <-chan error {
return proc.ErrorChan(masterStage.When(self, a))
})),
}
}
func (self *SchedulerProcess) logError(ch <-chan error) {
self.OnError(ch, func(err error) {
log.Errorf("failed to execute scheduler action: %v", err)
})
}
func (self *SchedulerProcess) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.Registered(drv, fid, mi)
})))
}
func (self *SchedulerProcess) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.Reregistered(drv, mi)
})))
}
func (self *SchedulerProcess) Disconnected(drv bindings.SchedulerDriver) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.Disconnected(drv)
})))
}
func (self *SchedulerProcess) ResourceOffers(drv bindings.SchedulerDriver, off []*mesos.Offer) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.ResourceOffers(drv, off)
})))
}
func (self *SchedulerProcess) OfferRescinded(drv bindings.SchedulerDriver, oid *mesos.OfferID) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.OfferRescinded(drv, oid)
})))
}
func (self *SchedulerProcess) StatusUpdate(drv bindings.SchedulerDriver, ts *mesos.TaskStatus) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.StatusUpdate(drv, ts)
})))
}
func (self *SchedulerProcess) FrameworkMessage(drv bindings.SchedulerDriver, eid *mesos.ExecutorID, sid *mesos.SlaveID, m string) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.FrameworkMessage(drv, eid, sid, m)
})))
}
func (self *SchedulerProcess) SlaveLost(drv bindings.SchedulerDriver, sid *mesos.SlaveID) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.SlaveLost(drv, sid)
})))
}
func (self *SchedulerProcess) ExecutorLost(drv bindings.SchedulerDriver, eid *mesos.ExecutorID, sid *mesos.SlaveID, x int) {
self.logError(self.Master().Do(proc.Action(func() {
self.Scheduler.ExecutorLost(drv, eid, sid, x)
})))
}
func (self *SchedulerProcess) Error(drv bindings.SchedulerDriver, msg string) {
self.Scheduler.Error(drv, msg)
}
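// Editor's sketch (not part of the original commit): a minimal, non-HA style
// lifecycle for the SchedulerProcess defined above. The sched and factory
// arguments are assumed to be provided by the caller (see the plugin test for
// a concrete construction via ha.New).
func exampleSchedulerProcessLifecycle(sched bindings.Scheduler, factory DriverFactory) {
	p := New(sched)
	// trigger election directly; in an HA setup this is done by candidateService.Start()
	p.Elect(factory)
	select {
	case <-p.Elected():
		log.Infoln("scheduler driver started; process is now master")
	case <-p.Terminal():
		log.Infoln("scheduler process ended before election completed")
	}
}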

View File

@@ -0,0 +1,30 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package meta
// kubernetes api object annotations
const (
BindingHostKey = "k8s.mesosphere.io/bindingHost"
TaskIdKey = "k8s.mesosphere.io/taskId"
SlaveIdKey = "k8s.mesosphere.io/slaveId"
OfferIdKey = "k8s.mesosphere.io/offerId"
ExecutorIdKey = "k8s.mesosphere.io/executorId"
PortMappingKeyPrefix = "k8s.mesosphere.io/port_"
PortMappingKeyFormat = PortMappingKeyPrefix + "%s_%d"
PortNameMappingKeyPrefix = "k8s.mesosphere.io/portName_"
PortNameMappingKeyFormat = PortNameMappingKeyPrefix + "%s_%s"
)
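// Editor's note (illustrative, not part of the original commit): the port
// mapping formats are filled in by the scheduler when preparing a task for
// launch, e.g.
//
//	fmt.Sprintf(PortMappingKeyFormat, "TCP", 8080)      // "k8s.mesosphere.io/port_TCP_8080"
//	fmt.Sprintf(PortNameMappingKeyFormat, "TCP", "web") // "k8s.mesosphere.io/portName_TCP_web"
//
// (see prepareTaskForLaunch in the scheduler plugin for the real call sites).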

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package meta defines framework constants used as keys in k8s annotations
// that are attached to k8s pods
package meta

View File

@@ -0,0 +1,24 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package meta
// keys for things that we store
const (
//TODO(jdef) this should also be a format instead of a fixed path
FrameworkIDKey = "/mesos/k8sm/frameworkid"
DefaultElectionFormat = "/mesos/k8sm/framework/%s/leader"
)
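// Editor's note (illustrative, not part of the original commit): the election
// format is expanded with a framework name before use, e.g.
//
//	fmt.Sprintf(DefaultElectionFormat, "kubernetes") // "/mesos/k8sm/framework/kubernetes/leader"
//
// where "kubernetes" is only an assumed example name.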

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package metrics defines and exposes instrumentation metrics of the scheduler.
package metrics

View File

@@ -0,0 +1,102 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
)
const (
schedulerSubsystem = "k8sm_scheduler"
)
var (
QueueWaitTime = prometheus.NewSummary(
prometheus.SummaryOpts{
Subsystem: schedulerSubsystem,
Name: "queue_wait_time_microseconds",
Help: "Launch queue wait time in microseconds",
},
)
BindLatency = prometheus.NewSummary(
prometheus.SummaryOpts{
Subsystem: schedulerSubsystem,
Name: "bind_latency_microseconds",
Help: "Latency in microseconds between pod-task launch and pod binding.",
},
)
StatusUpdates = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: schedulerSubsystem,
Name: "status_updates",
Help: "Counter of TaskStatus updates, broken out by source, reason, state.",
},
[]string{"source", "reason", "state"},
)
ReconciliationLatency = prometheus.NewSummary(
prometheus.SummaryOpts{
Subsystem: schedulerSubsystem,
Name: "reconciliation_latency_microseconds",
Help: "Latency in microseconds to execute explicit task reconciliation.",
},
)
ReconciliationRequested = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: schedulerSubsystem,
Name: "reconciliation_requested",
Help: "Counter of requested task reconciliations, broken out by kind.",
},
[]string{"kind"},
)
ReconciliationExecuted = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: schedulerSubsystem,
Name: "reconciliation_executed",
Help: "Counter of executed task reconciliations requests, broken out by kind.",
},
[]string{"kind"},
)
ReconciliationCancelled = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: schedulerSubsystem,
Name: "reconciliation_cancelled",
Help: "Counter of cancelled task reconciliations requests, broken out by kind.",
},
[]string{"kind"},
)
)
var registerMetrics sync.Once
func Register() {
registerMetrics.Do(func() {
prometheus.MustRegister(QueueWaitTime)
prometheus.MustRegister(BindLatency)
prometheus.MustRegister(StatusUpdates)
prometheus.MustRegister(ReconciliationLatency)
prometheus.MustRegister(ReconciliationRequested)
prometheus.MustRegister(ReconciliationExecuted)
prometheus.MustRegister(ReconciliationCancelled)
})
}
func InMicroseconds(d time.Duration) float64 {
return float64(d.Nanoseconds() / time.Microsecond.Nanoseconds())
}
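// Editor's sketch (not part of the original commit): typical use of the metrics
// defined above. The start parameter is assumed to mark when a pod-task entered
// the launch queue.
func exampleObserveQueueWait(start time.Time) {
	Register() // idempotent; the actual registration runs exactly once
	QueueWaitTime.Observe(InMicroseconds(time.Since(start)))
}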

View File

@@ -0,0 +1,203 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"sync"
"testing"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/stretchr/testify/mock"
)
// MockScheduler implements the schedulerInterface defined by the scheduler plugin
type MockScheduler struct {
sync.RWMutex
mock.Mock
}
func (m *MockScheduler) slaveFor(id string) (slave *Slave, ok bool) {
args := m.Called(id)
x := args.Get(0)
if x != nil {
slave = x.(*Slave)
}
ok = args.Bool(1)
return
}
func (m *MockScheduler) algorithm() (f PodScheduleFunc) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(PodScheduleFunc)
}
return
}
func (m *MockScheduler) createPodTask(ctx api.Context, pod *api.Pod) (task *podtask.T, err error) {
args := m.Called(ctx, pod)
x := args.Get(0)
if x != nil {
task = x.(*podtask.T)
}
err = args.Error(1)
return
}
func (m *MockScheduler) offers() (f offers.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(offers.Registry)
}
return
}
func (m *MockScheduler) tasks() (f podtask.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(podtask.Registry)
}
return
}
func (m *MockScheduler) killTask(taskId string) error {
args := m.Called(taskId)
return args.Error(0)
}
func (m *MockScheduler) launchTask(task *podtask.T) error {
args := m.Called(task)
return args.Error(0)
}
// @deprecated this is a placeholder for me to test the mock package
func TestNoSlavesYet(t *testing.T) {
obj := &MockScheduler{}
obj.On("slaveFor", "foo").Return(nil, false)
obj.slaveFor("foo")
obj.AssertExpectations(t)
}
/*-----------------------------------------------------------------------------
|
| this really belongs in the mesos-go package, but that's being updated soon
| any way so just keep it here for now unless we *really* need it there.
|
\-----------------------------------------------------------------------------
// Scheduler defines the interfaces that needed to be implemented.
type Scheduler interface {
Registered(SchedulerDriver, *FrameworkID, *MasterInfo)
Reregistered(SchedulerDriver, *MasterInfo)
Disconnected(SchedulerDriver)
ResourceOffers(SchedulerDriver, []*Offer)
OfferRescinded(SchedulerDriver, *OfferID)
StatusUpdate(SchedulerDriver, *TaskStatus)
FrameworkMessage(SchedulerDriver, *ExecutorID, *SlaveID, string)
SlaveLost(SchedulerDriver, *SlaveID)
ExecutorLost(SchedulerDriver, *ExecutorID, *SlaveID, int)
Error(SchedulerDriver, string)
}
*/
func status(args mock.Arguments, at int) (val mesos.Status) {
if x := args.Get(at); x != nil {
val = x.(mesos.Status)
}
return
}
type extendedMock struct {
mock.Mock
}
// Upon returns a chan that closes upon the execution of the most recently registered call.
func (m *extendedMock) Upon() <-chan struct{} {
ch := make(chan struct{})
call := &m.ExpectedCalls[len(m.ExpectedCalls)-1]
f := call.Run
call.Run = func(args mock.Arguments) {
defer close(ch)
if f != nil {
f(args)
}
}
return ch
}
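// Editor's note (illustrative): Upon is combined with testify's Call API, as in
// the plugin lifecycle test further below:
//
//	mockDriver.On("Start").Return(mesos.Status_DRIVER_RUNNING, nil).Once()
//	started := mockDriver.Upon() // closes once Start() has actually been called
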
type MockSchedulerDriver struct {
extendedMock
}
func (m *MockSchedulerDriver) Init() error {
args := m.Called()
return args.Error(0)
}
func (m *MockSchedulerDriver) Start() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Stop(b bool) (mesos.Status, error) {
args := m.Called(b)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Abort() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Join() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Run() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) RequestResources(r []*mesos.Request) (mesos.Status, error) {
args := m.Called(r)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) ReconcileTasks(statuses []*mesos.TaskStatus) (mesos.Status, error) {
args := m.Called(statuses)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) LaunchTasks(offerIds []*mesos.OfferID, ti []*mesos.TaskInfo, f *mesos.Filters) (mesos.Status, error) {
args := m.Called(offerIds, ti, f)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) KillTask(tid *mesos.TaskID) (mesos.Status, error) {
args := m.Called(tid)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) DeclineOffer(oid *mesos.OfferID, f *mesos.Filters) (mesos.Status, error) {
args := m.Called(oid, f)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) ReviveOffers() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) SendFrameworkMessage(eid *mesos.ExecutorID, sid *mesos.SlaveID, s string) (mesos.Status, error) {
args := m.Called(eid, sid, s)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Destroy() {
m.Called()
}
func (m *MockSchedulerDriver) Wait() {
m.Called()
}

View File

@@ -0,0 +1,875 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"io"
"net/http"
"strconv"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/backoff"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
annotation "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/record"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
plugin "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler"
"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
)
const (
enqueuePopTimeout = 200 * time.Millisecond
enqueueWaitTimeout = 1 * time.Second
yieldPopTimeout = 200 * time.Millisecond
yieldWaitTimeout = 1 * time.Second
pluginRecoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling
)
// scheduler abstraction to allow for easier unit testing
type schedulerInterface interface {
sync.Locker // synchronize scheduler plugin operations
SlaveIndex
algorithm() PodScheduleFunc
offers() offers.Registry
tasks() podtask.Registry
// driver calls
killTask(taskId string) error
launchTask(*podtask.T) error
// convenience
createPodTask(api.Context, *api.Pod) (*podtask.T, error)
}
type k8smScheduler struct {
sync.Mutex
internal *KubernetesScheduler
}
func (k *k8smScheduler) algorithm() PodScheduleFunc {
return k.internal.scheduleFunc
}
func (k *k8smScheduler) offers() offers.Registry {
return k.internal.offers
}
func (k *k8smScheduler) tasks() podtask.Registry {
return k.internal.taskRegistry
}
func (k *k8smScheduler) createPodTask(ctx api.Context, pod *api.Pod) (*podtask.T, error) {
return podtask.New(ctx, "", *pod, k.internal.executor)
}
func (k *k8smScheduler) slaveFor(id string) (slave *Slave, ok bool) {
slave, ok = k.internal.slaves.getSlave(id)
return
}
func (k *k8smScheduler) killTask(taskId string) error {
killTaskId := mutil.NewTaskID(taskId)
_, err := k.internal.driver.KillTask(killTaskId)
return err
}
func (k *k8smScheduler) launchTask(task *podtask.T) error {
// assume caller is holding scheduler lock
taskList := []*mesos.TaskInfo{task.BuildTaskInfo()}
offerIds := []*mesos.OfferID{task.Offer.Details().Id}
filters := &mesos.Filters{}
_, err := k.internal.driver.LaunchTasks(offerIds, taskList, filters)
return err
}
type binder struct {
api schedulerInterface
}
// implements binding.Registry, launches the pod-associated-task in mesos
func (b *binder) Bind(binding *api.Binding) error {
ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
// default upstream scheduler passes pod.Name as binding.Name
podKey, err := podtask.MakePodKey(ctx, binding.Name)
if err != nil {
return err
}
b.api.Lock()
defer b.api.Unlock()
switch task, state := b.api.tasks().ForPod(podKey); state {
case podtask.StatePending:
return b.bind(ctx, binding, task)
default:
// in this case it's likely that the pod has been deleted between Schedule
// and Bind calls
log.Infof("No pending task for pod %s", podKey)
return noSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
}
}
func (b *binder) rollback(task *podtask.T, err error) error {
task.Offer.Release()
task.Reset()
if err2 := b.api.tasks().Update(task); err2 != nil {
log.Errorf("failed to update pod task: %v", err2)
}
return err
}
// assumes that: caller has acquired scheduler lock and that the task is still pending
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
// sanity check: ensure the task still HasAcceptedOffer(); it's possible that between
// Schedule() and now the offer for this task was rescinded or invalidated
// (we should never see that here)
if !task.HasAcceptedOffer() {
return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
}
// By this time, there is a chance that the slave is disconnected.
offerId := task.GetOfferId()
if offer, ok := b.api.offers().Get(offerId); !ok || offer.HasExpired() {
// already rescinded or timed out or otherwise invalidated
return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
}
if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\"",
task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name)
if err = b.api.launchTask(task); err == nil {
b.api.offers().Invalidate(offerId)
task.Set(podtask.Launched)
if err = b.api.tasks().Update(task); err != nil {
// this should only happen if the task has been removed or has changed status,
// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
log.Errorf("failed to update task w/ Launched status: %v", err)
}
return
}
}
return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err))
}
//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
pod := task.Pod
// we make an effort here to avoid making changes to the task's copy of the pod, since
// we want that to reflect the initial user spec, and not the modified spec that we
// build for the executor to consume.
oemCt := pod.Spec.Containers
pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
} else {
oemAnn := pod.Annotations
pod.Annotations = make(map[string]string)
for k, v := range oemAnn {
pod.Annotations[k] = v
}
}
pod.Annotations[annotation.BindingHostKey] = machine
task.SaveRecoveryInfo(pod.Annotations)
for _, entry := range task.Spec.PortMap {
oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
ports := append([]api.ContainerPort{}, oemPorts...)
p := &ports[entry.PortIdx]
p.HostPort = int(entry.OfferPort)
op := strconv.FormatUint(entry.OfferPort, 10)
pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
if p.Name != "" {
pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
}
pod.Spec.Containers[entry.ContainerIdx].Ports = ports
}
// the kubelet-executor uses this to instantiate the pod
log.V(3).Infof("prepared pod spec: %+v", pod)
data, err := api.Codec.Encode(&pod)
if err != nil {
log.V(2).Infof("Failed to marshal the pod spec: %v", err)
return err
}
task.Spec.Data = data
return nil
}
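// Editor's note (illustrative, not part of the original commit): for a pod with
// a single TCP container port 8080 mapped to offer port 31000 and bound to host
// "slave-1" (all values assumed for the example), prepareTaskForLaunch leaves
// annotations roughly like
//
//	k8s.mesosphere.io/bindingHost:   slave-1
//	k8s.mesosphere.io/port_TCP_8080: "31000"
//
// plus whatever SaveRecoveryInfo records under the id keys defined in the meta package.
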
type kubeScheduler struct {
api schedulerInterface
podUpdates queue.FIFO
}
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the name of the selected machine and an error, if any.
func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.MinionLister) (string, error) {
log.Infof("Try to schedule pod %v\n", pod.Name)
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
// default upstream scheduler passes pod.Name as binding.PodID
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return "", err
}
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// There's a bit of a potential race here, a pod could have been yielded() and
// then before we get *here* it could be deleted.
// We use meta to index the pod in the store since that's what k8s reflector does.
podName, err := cache.MetaNamespaceKeyFunc(pod)
if err != nil {
log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
return "", noSuchPodErr
}
if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
log.Infof("aborting Schedule, pod has been deleted %+v", pod)
return "", noSuchPodErr
}
return k.doSchedule(k.api.tasks().Register(k.api.createPodTask(ctx, pod)))
//TODO(jdef) it's possible that the pod state has diverged from what
//we knew previously, we should probably update the task.Pod state here
//before proceeding with scheduling
case podtask.StatePending:
if pod.UID != task.Pod.UID {
// we're dealing with a brand new pod spec here, so the old one must have been
// deleted -- and so our task store is out of sync w/ respect to reality
//TODO(jdef) reconcile task
return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
} else if task.Has(podtask.Launched) {
// task has been marked as "launched" but the pod binding creation may have failed in k8s,
// but we're going to let someone else handle it, probably the mesos task error handler
return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
} else {
return k.doSchedule(task, nil)
}
default:
return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
}
}
// Call ScheduleFunc and subtract some resources, returning the name of the machine the task is scheduled on
func (k *kubeScheduler) doSchedule(task *podtask.T, err error) (string, error) {
var offer offers.Perishable
if task.HasAcceptedOffer() {
// verify that the offer is still on the table
offerId := task.GetOfferId()
if o, ok := k.api.offers().Get(offerId); ok && !o.HasExpired() {
// skip tasks that have already been assigned an offer; note that we must not
// shadow the outer offer variable here, otherwise the assignment below is lost
offer = task.Offer
} else {
task.Offer.Release()
task.Reset()
if err = k.api.tasks().Update(task); err != nil {
return "", err
}
}
}
if err == nil && offer == nil {
offer, err = k.api.algorithm()(k.api.offers(), k.api, task)
}
if err != nil {
return "", err
}
details := offer.Details()
if details == nil {
return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
}
slaveId := details.GetSlaveId().GetValue()
if slave, ok := k.api.slaveFor(slaveId); !ok {
// not much sense in Release()ing the offer here since its owner died
offer.Release()
k.api.offers().Invalidate(details.Id.GetValue())
return "", fmt.Errorf("Slave disappeared (%v) while scheduling task %v", slaveId, task.ID)
} else {
if task.Offer != nil && task.Offer != offer {
return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
}
task.Offer = offer
task.FillFromDetails(details)
if err := k.api.tasks().Update(task); err != nil {
offer.Release()
return "", err
}
return slave.HostName, nil
}
}
type queuer struct {
lock sync.Mutex // shared by condition variables of this struct
podUpdates queue.FIFO // queue of pod updates to be processed
podQueue *queue.DelayFIFO // queue of pods to be scheduled
deltaCond sync.Cond // pod changes are available for processing
unscheduledCond sync.Cond // there are unscheduled pods for processing
}
func newQueuer(store queue.FIFO) *queuer {
q := &queuer{
podQueue: queue.NewDelayFIFO(),
podUpdates: store,
}
q.deltaCond.L = &q.lock
q.unscheduledCond.L = &q.lock
return q
}
func (q *queuer) installDebugHandlers(mux *http.ServeMux) {
mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.podQueue.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.podUpdates.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
}
// signal that there are probably pod updates waiting to be processed
func (q *queuer) updatesAvailable() {
q.deltaCond.Broadcast()
}
// delete a pod from the to-be-scheduled queue
func (q *queuer) dequeue(id string) {
q.podQueue.Delete(id)
}
// re-add a pod to the to-be-scheduled queue, will not overwrite existing pod data (that
// may have already changed).
func (q *queuer) requeue(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
q.podQueue.Add(pod, queue.KeepExisting)
q.unscheduledCond.Broadcast()
}
// same as requeue but calls podQueue.Offer instead of podQueue.Add
func (q *queuer) reoffer(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
if q.podQueue.Offer(pod, queue.KeepExisting) {
q.unscheduledCond.Broadcast()
}
}
// spawns a go-routine to watch for unscheduled pods and queue them up
// for scheduling. returns immediately.
func (q *queuer) Run(done <-chan struct{}) {
go runtime.Until(func() {
log.Info("Watching for newly created pods")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here for short intervals so that scheduling
// may proceed even if there have been no recent pod changes
p := q.podUpdates.Await(enqueuePopTimeout)
if p == nil {
signalled := runtime.After(q.deltaCond.Wait)
// we've yielded the lock
select {
case <-time.After(enqueueWaitTimeout):
q.deltaCond.Broadcast() // abort Wait()
<-signalled // wait for lock re-acquisition
log.V(4).Infoln("timed out waiting for a pod update")
case <-signalled:
// we've acquired the lock and there may be
// changes for us to process now
}
continue
}
pod := p.(*Pod)
if pod.Spec.NodeName != "" {
log.V(3).Infof("dequeuing pod for scheduling: %v", pod.Pod.Name)
q.dequeue(pod.GetUID())
} else {
// use ReplaceExisting because we are always pushing the latest state
now := time.Now()
pod.deadline = &now
if q.podQueue.Offer(pod, queue.ReplaceExisting) {
q.unscheduledCond.Broadcast()
log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
} else {
log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
}
}
}
}, 1*time.Second, done)
}
// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
func (q *queuer) yield() *api.Pod {
log.V(2).Info("attempting to yield a pod")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here to short intervals so that we don't block the
// enqueuer Run() routine for very long
kpod := q.podQueue.Await(yieldPopTimeout)
if kpod == nil {
signalled := runtime.After(q.unscheduledCond.Wait)
// lock is yielded at this point and we're going to wait for either
// a timeout, or a signal that there's data
select {
case <-time.After(yieldWaitTimeout):
q.unscheduledCond.Broadcast() // abort Wait()
<-signalled // wait for the go-routine, and the lock
log.V(4).Infoln("timed out waiting for a pod to yield")
case <-signalled:
// we have acquired the lock, and there
// may be a pod for us to pop now
}
continue
}
pod := kpod.(*Pod).Pod
if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
} else if pod.Spec.NodeName != "" {
// should never happen if enqueuePods is filtering properly
log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
} else {
return pod
}
}
}
type errorHandler struct {
api schedulerInterface
backoff *backoff.Backoff
qr *queuer
}
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {
if schedulingErr == noSuchPodErr {
log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
return
}
log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
defer util.HandleCrash()
// default upstream scheduler passes pod.Name as binding.PodID
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
return
}
k.backoff.GC()
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// if we don't have a mapping here any more then someone deleted the pod
log.V(2).Infof("Could not resolve pod to task, aborting pod reschdule: %s", podKey)
return
case podtask.StatePending:
if task.Has(podtask.Launched) {
log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
return
}
breakoutEarly := queue.BreakChan(nil)
if schedulingErr == noSuitableOffersErr {
log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
breakoutEarly = queue.BreakChan(k.api.offers().Listen(podKey, func(offer *mesos.Offer) bool {
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().Get(task.ID); state {
case podtask.StatePending:
return !task.Has(podtask.Launched) && task.AcceptOffer(offer)
default:
// no point in continuing to check for matching offers
return true
}
}))
}
delay := k.backoff.Get(podKey)
log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
k.qr.requeue(&Pod{Pod: pod, delay: &delay, notify: breakoutEarly})
default:
log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
}
}
type deleter struct {
api schedulerInterface
qr *queuer
}
// Run currently monitors for "pod deleted" events, upon which deleteOne()
// is invoked.
func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
go runtime.Until(func() {
for {
entry := <-updates
pod := entry.Value().(*Pod)
if entry.Is(queue.DELETE_EVENT) {
if err := k.deleteOne(pod); err != nil {
log.Error(err)
}
} else if !entry.Is(queue.POP_EVENT) {
k.qr.updatesAvailable()
}
}
}, 1*time.Second, done)
}
func (k *deleter) deleteOne(pod *Pod) error {
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return err
}
log.V(2).Infof("pod deleted: %v", podKey)
// order is important here: we want to make sure we have the lock before
// removing the pod from the scheduling queue. this makes the concurrent
// execution of scheduler-error-handling and delete-handling easier to
// reason about.
k.api.Lock()
defer k.api.Unlock()
// prevent the scheduler from attempting to pop this; it's also possible that
// it's concurrently being scheduled (somewhere between pod scheduling and
// binding) - if so, then we'll end up removing it from taskRegistry which
// will abort Bind()ing
k.qr.dequeue(pod.GetUID())
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
return noSuchPodErr
// determine if the task has already been launched to mesos, if not then
// cleanup is easier (unregister) since there's no state to sync
case podtask.StatePending:
if !task.Has(podtask.Launched) {
// we've been invoked in between Schedule() and Bind()
if task.HasAcceptedOffer() {
task.Offer.Release()
task.Reset()
task.Set(podtask.Deleted)
//TODO(jdef) probably want better handling here
if err := k.api.tasks().Update(task); err != nil {
return err
}
}
k.api.tasks().Unregister(task)
return nil
}
fallthrough
case podtask.StateRunning:
// signal to watchers that the related pod is going down
task.Set(podtask.Deleted)
if err := k.api.tasks().Update(task); err != nil {
log.Errorf("failed to update task w/ Deleted status: %v", err)
}
return k.api.killTask(task.ID)
default:
log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
return noSuchTaskErr
}
}
// NewDefaultPluginConfig creates a scheduler plugin config, along with all supporting background functions.
func (k *KubernetesScheduler) NewDefaultPluginConfig(terminate <-chan struct{}, mux *http.ServeMux) *PluginConfig {
// by default, watch pods using a ListWatch backed by the client
return k.NewPluginConfig(terminate, mux, createAllPodsLW(k.client))
}
func (k *KubernetesScheduler) NewPluginConfig(terminate <-chan struct{}, mux *http.ServeMux,
podsWatcher *cache.ListWatch) *PluginConfig {
// Watch and queue pods that need scheduling.
updates := make(chan queue.Entry, k.schedcfg.UpdatesBacklog)
podUpdates := &podStoreAdapter{queue.NewHistorical(updates)}
reflector := cache.NewReflector(podsWatcher, &api.Pod{}, podUpdates, 0)
// lock that guards critical sections that involve transferring pods from
// the store (cache) to the scheduling queue; its purpose is to maintain
// an ordering (vs interleaving) of operations that's easier to reason about.
kapi := &k8smScheduler{internal: k}
q := newQueuer(podUpdates)
podDeleter := &deleter{
api: kapi,
qr: q,
}
eh := &errorHandler{
api: kapi,
backoff: backoff.New(k.schedcfg.InitialPodBackoff.Duration, k.schedcfg.MaxPodBackoff.Duration),
qr: q,
}
startLatch := make(chan struct{})
eventBroadcaster := record.NewBroadcaster()
runtime.On(startLatch, func() {
eventBroadcaster.StartRecordingToSink(k.client.Events(""))
reflector.Run() // TODO(jdef) should listen for termination
podDeleter.Run(updates, terminate)
q.Run(terminate)
q.installDebugHandlers(mux)
podtask.InstallDebugHandlers(k.taskRegistry, mux)
})
return &PluginConfig{
Config: &plugin.Config{
MinionLister: nil,
Algorithm: &kubeScheduler{
api: kapi,
podUpdates: podUpdates,
},
Binder: &binder{api: kapi},
NextPod: q.yield,
Error: eh.handleSchedulingError,
Recorder: eventBroadcaster.NewRecorder(api.EventSource{Component: "scheduler"}),
},
api: kapi,
client: k.client,
qr: q,
deleter: podDeleter,
starting: startLatch,
}
}
type PluginConfig struct {
*plugin.Config
api schedulerInterface
client *client.Client
qr *queuer
deleter *deleter
starting chan struct{} // startup latch
}
func NewPlugin(c *PluginConfig) PluginInterface {
return &schedulingPlugin{
config: c.Config,
api: c.api,
client: c.client,
qr: c.qr,
deleter: c.deleter,
starting: c.starting,
}
}
type schedulingPlugin struct {
config *plugin.Config
api schedulerInterface
client *client.Client
qr *queuer
deleter *deleter
starting chan struct{}
}
func (s *schedulingPlugin) Run(done <-chan struct{}) {
defer close(s.starting)
go runtime.Until(s.scheduleOne, pluginRecoveryDelay, done)
}
// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go,
// with the Modeler stuff removed since we don't use it here; mesos covers that for us.
func (s *schedulingPlugin) scheduleOne() {
pod := s.config.NextPod()
log.V(3).Infof("Attempting to schedule: %v", pod)
dest, err := s.config.Algorithm.Schedule(pod, s.config.MinionLister) // call kubeScheduler.Schedule
if err != nil {
log.V(1).Infof("Failed to schedule: %v", pod)
s.config.Recorder.Eventf(pod, "failedScheduling", "Error scheduling: %v", err)
s.config.Error(pod, err)
return
}
b := &api.Binding{
ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
Target: api.ObjectReference{
Kind: "Node",
Name: dest,
},
}
if err := s.config.Binder.Bind(b); err != nil {
log.V(1).Infof("Failed to bind pod: %v", err)
s.config.Recorder.Eventf(pod, "failedScheduling", "Binding rejected: %v", err)
s.config.Error(pod, err)
return
}
s.config.Recorder.Eventf(pod, "scheduled", "Successfully assigned %v to %v", pod.Name, dest)
}
// this pod may be out of sync with respect to the API server registry:
// this pod | apiserver registry
// -------------|----------------------
// host=.* | 404 ; pod was deleted
// host=.* | 5xx ; failed to sync, try again later?
// host="" | host="" ; perhaps no updates to process?
// host="" | host="..." ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
// host="..." | host="" ; pod is no longer scheduled, does it need to be re-queued?
// host="..." | host="..." ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *schedulingPlugin) reconcilePod(oldPod api.Pod) {
log.V(1).Infof("reconcile pod %v", oldPod.Name)
ctx := api.WithNamespace(api.NewDefaultContext(), oldPod.Namespace)
pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(oldPod.Name)
if err != nil {
if errors.IsNotFound(err) {
// attempt to delete
if err = s.deleter.deleteOne(&Pod{Pod: &oldPod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
log.Errorf("failed to delete pod: %v: %v", oldPod.Name, err)
}
} else {
//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
//For now, drop the pod on the floor
log.Warning("aborting reconciliation for pod %v: %v", oldPod.Name, err)
}
return
}
if oldPod.Spec.NodeName != pod.Spec.NodeName {
if pod.Spec.NodeName == "" {
// pod is unscheduled.
// it's possible that we dropped the pod in the scheduler error handler
// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Error(err)
return
}
s.api.Lock()
defer s.api.Unlock()
if _, state := s.api.tasks().ForPod(podKey); state != podtask.StateUnknown {
//TODO(jdef) reconcile the task
log.Errorf("task already registered for pod %v", pod.Name)
return
}
now := time.Now()
log.V(3).Infof("reoffering pod %v", podKey)
s.qr.reoffer(&Pod{
Pod: pod,
deadline: &now,
})
} else {
// pod is scheduled.
// not sure how this happened behind our backs. attempt to reconstruct
// at least a partial podtask.T record.
//TODO(jdef) reconcile the task
log.Errorf("pod already scheduled: %v", pod.Name)
}
} else {
//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
//and assume that our knowledge of the pod aligns with that of the apiserver
log.Error("pod reconciliation does not support updates; not yet implemented")
}
}
func parseSelectorOrDie(s string) fields.Selector {
selector, err := fields.ParseSelector(s)
if err != nil {
panic(err)
}
return selector
}
// createAllPodsLW returns a listWatch that finds all pods
func createAllPodsLW(cl *client.Client) *cache.ListWatch {
return cache.NewListWatchFromClient(cl, "pods", api.NamespaceAll, parseSelectorOrDie(""))
}
// Consumes *api.Pod, produces *Pod; the k8s reflector wants to push *api.Pod
// objects at us, but we want to store more flexible (Pod) type defined in
// this package. The adapter implementation facilitates this. It's a little
// hackish since the object type going in is different than the object type
// coming out -- you've been warned.
type podStoreAdapter struct {
queue.FIFO
}
func (psa *podStoreAdapter) Add(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Add(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Update(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Update(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Delete(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Delete(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
pod := obj.(*api.Pod)
return psa.FIFO.Get(&Pod{Pod: pod})
}
// Replace will delete the contents of the store, using instead the
// given list. This store implementation does NOT take ownership of the list.
func (psa *podStoreAdapter) Replace(objs []interface{}) error {
newobjs := make([]interface{}, len(objs))
for i, v := range objs {
pod := v.(*api.Pod)
newobjs[i] = &Pod{Pod: pod}
}
return psa.FIFO.Replace(newobjs)
}
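// Editor's sketch (not part of the original commit): how the pieces in this file
// are typically assembled. The scheduler k, the terminate channel and the mux are
// assumed to be provided by the caller; compare TestPlugin_LifeCycle further below
// for a mock-based variant that uses NewPluginConfig instead.
func examplePluginWiring(k *KubernetesScheduler, terminate <-chan struct{}, mux *http.ServeMux) {
	config := k.NewDefaultPluginConfig(terminate, mux) // watch all pods via the client
	schedPlugin := NewPlugin(config)
	schedPlugin.Run(terminate) // spawns the scheduleOne loop until terminate closes
}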

View File

@@ -0,0 +1,700 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"net/http"
"net/http/httptest"
"sync"
"testing"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/testapi"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
kutil "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
assertext "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/assert"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/ha"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
util "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
)
// An apiserver mock which partially mocks the pods API
type TestServer struct {
server *httptest.Server
stats map[string]uint
lock sync.Mutex
}
func NewTestServer(t *testing.T, namespace string, mockPodListWatch *MockPodsListWatch) *TestServer {
ts := TestServer{
stats: map[string]uint{},
}
mux := http.NewServeMux()
mux.HandleFunc(testapi.ResourcePath("pods", namespace, ""), func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
pods := mockPodListWatch.Pods()
w.Write([]byte(runtime.EncodeOrDie(testapi.Codec(), &pods)))
})
podsPrefix := testapi.ResourcePath("pods", namespace, "") + "/"
mux.HandleFunc(podsPrefix, func(w http.ResponseWriter, r *http.Request) {
name := r.URL.Path[len(podsPrefix):]
// update statistics for this pod
ts.lock.Lock()
defer ts.lock.Unlock()
ts.stats[name] = ts.stats[name] + 1
p := mockPodListWatch.GetPod(name)
if p != nil {
w.WriteHeader(http.StatusOK)
w.Write([]byte(runtime.EncodeOrDie(testapi.Codec(), p)))
return
}
w.WriteHeader(http.StatusNotFound)
})
mux.HandleFunc(testapi.ResourcePath("events", namespace, ""), func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
})
mux.HandleFunc("/", func(res http.ResponseWriter, req *http.Request) {
t.Errorf("unexpected request: %v", req.RequestURI)
res.WriteHeader(http.StatusNotFound)
})
ts.server = httptest.NewServer(mux)
return &ts
}
func (ts *TestServer) Stats(name string) uint {
ts.lock.Lock()
defer ts.lock.Unlock()
return ts.stats[name]
}
// MockPodsListWatch mocks the pods ListWatch that would normally listen on the apiserver's pods watch endpoint
type MockPodsListWatch struct {
ListWatch cache.ListWatch
fakeWatcher *watch.FakeWatcher
list api.PodList
lock sync.Mutex
}
func NewMockPodsListWatch(initialPodList api.PodList) *MockPodsListWatch {
lw := MockPodsListWatch{
fakeWatcher: watch.NewFake(),
list: initialPodList,
}
lw.ListWatch = cache.ListWatch{
WatchFunc: func(resourceVersion string) (watch.Interface, error) {
return lw.fakeWatcher, nil
},
ListFunc: func() (runtime.Object, error) {
return &lw.list, nil
},
}
return &lw
}
func (lw *MockPodsListWatch) Pods() api.PodList {
lw.lock.Lock()
defer lw.lock.Unlock()
return lw.list
}
func (lw *MockPodsListWatch) GetPod(name string) *api.Pod {
lw.lock.Lock()
defer lw.lock.Unlock()
for _, p := range lw.list.Items {
if p.Name == name {
return &p
}
}
return nil
}
func (lw *MockPodsListWatch) Add(pod *api.Pod, notify bool) {
lw.lock.Lock()
defer lw.lock.Unlock()
lw.list.Items = append(lw.list.Items, *pod)
if notify {
lw.fakeWatcher.Add(pod)
}
}
func (lw *MockPodsListWatch) Modify(pod *api.Pod, notify bool) {
lw.lock.Lock()
defer lw.lock.Unlock()
for i, otherPod := range lw.list.Items {
if otherPod.Name == pod.Name {
lw.list.Items[i] = *pod
if notify {
lw.fakeWatcher.Modify(pod)
}
return
}
}
log.Fatalf("Cannot find pod %v to modify in MockPodsListWatch", pod.Name)
}
func (lw *MockPodsListWatch) Delete(pod *api.Pod, notify bool) {
lw.lock.Lock()
defer lw.lock.Unlock()
for i, otherPod := range lw.list.Items {
if otherPod.Name == pod.Name {
lw.list.Items = append(lw.list.Items[:i], lw.list.Items[i+1:]...)
if notify {
lw.fakeWatcher.Delete(&otherPod)
}
return
}
}
log.Fatalf("Cannot find pod %v to delete in MockPodsListWatch", pod.Name)
}
// Create a pod with a given index, requiring one port
func NewTestPod(i int) *api.Pod {
name := fmt.Sprintf("pod%d", i)
return &api.Pod{
TypeMeta: api.TypeMeta{APIVersion: testapi.Version()},
ObjectMeta: api.ObjectMeta{
Name: name,
Namespace: "default",
SelfLink: fmt.Sprintf("http://1.2.3.4/api/v1beta1/pods/%s", name),
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Ports: []api.ContainerPort{
{
ContainerPort: 8000 + i,
Protocol: api.ProtocolTCP,
},
},
},
},
},
Status: api.PodStatus{
PodIP: fmt.Sprintf("1.2.3.%d", 4+i),
Conditions: []api.PodCondition{
{
Type: api.PodReady,
Status: api.ConditionTrue,
},
},
},
}
}
// Offering some cpus and memory and the 8000-9000 port range
func NewTestOffer(i int) *mesos.Offer {
hostname := fmt.Sprintf("h%d", i)
cpus := util.NewScalarResource("cpus", 3.75)
mem := util.NewScalarResource("mem", 940)
var port8000 uint64 = 8000
var port9000 uint64 = 9000
ports8000to9000 := mesos.Value_Range{Begin: &port8000, End: &port9000}
ports := util.NewRangesResource("ports", []*mesos.Value_Range{&ports8000to9000})
return &mesos.Offer{
Id: util.NewOfferID(fmt.Sprintf("offer%d", i)),
Hostname: &hostname,
SlaveId: util.NewSlaveID(hostname),
Resources: []*mesos.Resource{cpus, mem, ports},
}
}
// Add assertions to reason about event streams
type Event struct {
Object runtime.Object
Reason string
Message string
}
type EventPredicate func(e Event) bool
type EventAssertions struct {
assert.Assertions
}
// EventObserver implements record.EventRecorder for the purposes of validation via EventAssertions.
type EventObserver struct {
fifo chan Event
}
func NewEventObserver() *EventObserver {
return &EventObserver{
fifo: make(chan Event, 1000),
}
}
func (o *EventObserver) Event(object runtime.Object, reason, message string) {
o.fifo <- Event{Object: object, Reason: reason, Message: message}
}
func (o *EventObserver) Eventf(object runtime.Object, reason, messageFmt string, args ...interface{}) {
o.fifo <- Event{Object: object, Reason: reason, Message: fmt.Sprintf(messageFmt, args...)}
}
func (o *EventObserver) PastEventf(object runtime.Object, timestamp kutil.Time, reason, messageFmt string, args ...interface{}) {
o.fifo <- Event{Object: object, Reason: reason, Message: fmt.Sprintf(messageFmt, args...)}
}
func (a *EventAssertions) Event(observer *EventObserver, pred EventPredicate, msgAndArgs ...interface{}) bool {
// parse msgAndArgs: first possibly a duration, otherwise a format string with further args
timeout := time.Second * 2
msg := "event not received"
msgArgStart := 0
if len(msgAndArgs) > 0 {
switch msgAndArgs[0].(type) {
case time.Duration:
timeout = msgAndArgs[0].(time.Duration)
msgArgStart += 1
}
}
if len(msgAndArgs) > msgArgStart {
msg = fmt.Sprintf(msgAndArgs[msgArgStart].(string), msgAndArgs[msgArgStart+1:]...)
}
// watch events
result := make(chan bool)
stop := make(chan struct{})
go func() {
for {
select {
case e, ok := <-observer.fifo:
if !ok {
result <- false
return
} else if pred(e) {
log.V(3).Infof("found asserted event for reason '%v': %v", e.Reason, e.Message)
result <- true
return
} else {
log.V(5).Infof("ignoring not-asserted event for reason '%v': %v", e.Reason, e.Message)
}
case _, ok := <-stop:
if !ok {
return
}
}
}
}()
defer close(stop)
// wait for watch to match or timeout
select {
case matched := <-result:
return matched
case <-time.After(timeout):
return a.Fail(msg)
}
}
func (a *EventAssertions) EventWithReason(observer *EventObserver, reason string, msgAndArgs ...interface{}) bool {
return a.Event(observer, func(e Event) bool {
return e.Reason == reason
}, msgAndArgs...)
}
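// Illustrative usage of the assertion helpers above (a sketch mirroring the life-cycle test
// below; the reason string and timeout are examples, not fixed values):
//
//	assert := &EventAssertions{*assert.New(t)}
//	obs := NewEventObserver()
//	// an optional leading time.Duration overrides the default 2s timeout;
//	// any remaining arguments form the failure message
//	assert.EventWithReason(obs, "scheduled", 5*time.Second, "scheduled event not received")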
type joinableDriver struct {
MockSchedulerDriver
joinFunc func() (mesos.Status, error)
}
// Join invokes joinFunc if it has been set, otherwise blocks forever
func (m *joinableDriver) Join() (mesos.Status, error) {
if m.joinFunc != nil {
return m.joinFunc()
}
select {}
}
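// Note: with joinFunc unset, Join blocks forever on the empty select, emulating a driver
// that never terminates on its own.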
// Create mesos.TaskStatus for a given task
func newTaskStatusForTask(task *mesos.TaskInfo, state mesos.TaskState) *mesos.TaskStatus {
healthy := state == mesos.TaskState_TASK_RUNNING
ts := float64(time.Now().Nanosecond()) / 1000000000.0
source := mesos.TaskStatus_SOURCE_EXECUTOR
return &mesos.TaskStatus{
TaskId: task.TaskId,
State: &state,
SlaveId: task.SlaveId,
ExecutorId: task.Executor.ExecutorId,
Timestamp: &ts,
Healthy: &healthy,
Source: &source,
Data: task.Data,
}
}
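// Note: the Timestamp above carries only the sub-second fraction of the current time,
// not a full Unix timestamp.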
// Test to create the scheduler plugin with an empty plugin config
func TestPlugin_New(t *testing.T) {
assert := assert.New(t)
c := PluginConfig{}
p := NewPlugin(&c)
assert.NotNil(p)
}
// Test to create the scheduler plugin with the config returned by the scheduler,
// and play through the whole life cycle of the plugin while creating, deleting,
// and failing pods.
func TestPlugin_LifeCycle(t *testing.T) {
assert := &EventAssertions{*assert.New(t)}
// create a fake pod watch. We use that below to submit new pods to the scheduler
podListWatch := NewMockPodsListWatch(api.PodList{})
// create fake apiserver
testApiServer := NewTestServer(t, api.NamespaceDefault, podListWatch)
defer testApiServer.server.Close()
// create scheduler
testScheduler := New(Config{
Executor: util.NewExecutorInfo(
util.NewExecutorID("executor-id"),
util.NewCommandInfo("executor-cmd"),
),
Client: client.NewOrDie(&client.Config{Host: testApiServer.server.URL, Version: testapi.Version()}),
ScheduleFunc: FCFSScheduleFunc,
Schedcfg: *schedcfg.CreateDefaultConfig(),
})
assert.NotNil(testScheduler.client, "client is nil")
assert.NotNil(testScheduler.executor, "executor is nil")
assert.NotNil(testScheduler.offers, "offer registry is nil")
// create scheduler process
schedulerProcess := ha.New(testScheduler)
// get plugin config from it
c := testScheduler.NewPluginConfig(schedulerProcess.Terminal(), http.DefaultServeMux, &podListWatch.ListWatch)
assert.NotNil(c)
// make events observable
eventObserver := NewEventObserver()
c.Recorder = eventObserver
// create plugin
p := NewPlugin(c)
assert.NotNil(p)
// run plugin
p.Run(schedulerProcess.Terminal())
defer schedulerProcess.End()
// init scheduler
err := testScheduler.Init(schedulerProcess.Master(), p, http.DefaultServeMux)
assert.NoError(err)
// create mock mesos scheduler driver
mockDriver := &joinableDriver{}
mockDriver.On("Start").Return(mesos.Status_DRIVER_RUNNING, nil).Once()
started := mockDriver.Upon()
mAny := mock.AnythingOfType
mockDriver.On("ReconcileTasks", mAny("[]*mesosproto.TaskStatus")).Return(mesos.Status_DRIVER_RUNNING, nil)
mockDriver.On("SendFrameworkMessage", mAny("*mesosproto.ExecutorID"), mAny("*mesosproto.SlaveID"), mAny("string")).
Return(mesos.Status_DRIVER_RUNNING, nil)
launchedTasks := make(chan *mesos.TaskInfo, 1)
launchTasksCalledFunc := func(args mock.Arguments) {
taskInfos := args.Get(1).([]*mesos.TaskInfo)
assert.Equal(1, len(taskInfos))
launchedTasks <- taskInfos[0]
}
mockDriver.On("LaunchTasks", mAny("[]*mesosproto.OfferID"), mAny("[]*mesosproto.TaskInfo"), mAny("*mesosproto.Filters")).
Return(mesos.Status_DRIVER_RUNNING, nil).Run(launchTasksCalledFunc)
// elect master with mock driver
driverFactory := ha.DriverFactory(func() (bindings.SchedulerDriver, error) {
return mockDriver, nil
})
schedulerProcess.Elect(driverFactory)
elected := schedulerProcess.Elected()
// driver will be started
<-started
// tell scheduler to be registered
testScheduler.Registered(
mockDriver,
util.NewFrameworkID("kubernetes-id"),
util.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050),
)
// wait for being elected
<-elected
//TODO(jdef) refactor things above here into a test suite setup of some sort
// fake new, unscheduled pod
pod1 := NewTestPod(1)
podListWatch.Add(pod1, true) // notify watchers
// wait for failedScheduling event because there is no offer
assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")
// add some matching offer
offers1 := []*mesos.Offer{NewTestOffer(1)}
testScheduler.ResourceOffers(nil, offers1)
// and wait for scheduled pod
assert.EventWithReason(eventObserver, "scheduled")
select {
case launchedTask := <-launchedTasks:
// report back that the task has been staged, and then started by mesos
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING))
// report back that the task has been lost
mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_LOST))
// and check that a framework message was sent to the executor
mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 1)
case <-time.After(5 * time.Second):
t.Fatalf("timed out waiting for launchTasks call")
}
// start another pod
podNum := 1
startPod := func(offers []*mesos.Offer) (*api.Pod, *mesos.TaskInfo) {
podNum = podNum + 1
// create pod and matching offer
pod := NewTestPod(podNum)
podListWatch.Add(pod, true) // notify watchers
testScheduler.ResourceOffers(mockDriver, offers)
assert.EventWithReason(eventObserver, "scheduled")
// wait for driver.launchTasks call
select {
case launchedTask := <-launchedTasks:
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING))
return pod, launchedTask
case <-time.After(5 * time.Second):
t.Fatal("timed out waiting for launchTasks")
return nil, nil
}
}
pod, launchedTask := startPod(offers1)
// mock driver.KillTask; should be invoked when a pod is deleted
mockDriver.On("KillTask", mAny("*mesosproto.TaskID")).Return(mesos.Status_DRIVER_RUNNING, nil).Run(func(args mock.Arguments) {
killedTaskId := *(args.Get(0).(*mesos.TaskID))
assert.Equal(*launchedTask.TaskId, killedTaskId, "expected same TaskID as during launch")
})
killTaskCalled := mockDriver.Upon()
// stop it again via the apiserver mock
podListWatch.Delete(pod, true) // notify watchers
// and wait for the driver killTask call with the correct TaskId
select {
case <-killTaskCalled:
// report back that the task is finished
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_FINISHED))
case <-time.After(5 * time.Second):
t.Fatal("timed out waiting for KillTask")
}
// start pods:
// - which fail while binding,
// - leading to reconciliation,
// - with different pod states on the apiserver
failPodFromExecutor := func(task *mesos.TaskInfo) {
beforePodLookups := testApiServer.Stats(pod.Name)
status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
message := messages.CreateBindingFailure
status.Message = &message
testScheduler.StatusUpdate(mockDriver, status)
// wait until pod is looked up at the apiserver
assertext.EventuallyTrue(t, time.Second, func() bool {
return testApiServer.Stats(pod.Name) == beforePodLookups+1
}, "expect that reconcilePod will access apiserver for pod %v", pod.Name)
}
// 1. with pod deleted from the apiserver
pod, launchedTask = startPod(offers1)
podListWatch.Delete(pod, false) // not notifying the watchers
failPodFromExecutor(launchedTask)
// 2. with pod still on the apiserver, not bound
pod, launchedTask = startPod(offers1)
failPodFromExecutor(launchedTask)
// 3. with pod still on the apiserver, bound i.e. host!=""
pod, launchedTask = startPod(offers1)
pod.Spec.NodeName = *offers1[0].Hostname
podListWatch.Modify(pod, false) // not notifying the watchers
failPodFromExecutor(launchedTask)
// 4. with pod still on the apiserver, bound i.e. host!="", notified via ListWatch
pod, launchedTask = startPod(offers1)
pod.Spec.NodeName = *offers1[0].Hostname
podListWatch.Modify(pod, true) // notifying the watchers
time.Sleep(time.Second / 2)
failPodFromExecutor(launchedTask)
}
func TestDeleteOne_NonexistentPod(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
qr := newQueuer(nil)
assert.Equal(0, len(qr.podQueue.List()))
d := &deleter{
api: obj,
qr: qr,
}
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
Namespace: api.NamespaceDefault,
}}}
err := d.deleteOne(pod)
assert.Equal(err, noSuchPodErr)
obj.AssertExpectations(t)
}
func TestDeleteOne_PendingPod(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
_, err := reg.Register(podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{}))
if err != nil {
t.Fatalf("failed to create task: %v", err)
}
// preconditions
qr := newQueuer(nil)
qr.podQueue.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(qr.podQueue.List()))
_, found := qr.podQueue.Get("default/foo")
assert.True(found)
// exec & post conditions
d := &deleter{
api: obj,
qr: qr,
}
err = d.deleteOne(pod)
assert.Nil(err)
_, found = qr.podQueue.Get("foo0")
assert.False(found)
assert.Equal(0, len(qr.podQueue.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_Running(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := reg.Register(podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{}))
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task.Set(podtask.Launched)
err = reg.Update(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// preconditions
qr := newQueuer(nil)
qr.podQueue.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(qr.podQueue.List()))
_, found := qr.podQueue.Get("default/foo")
assert.True(found)
obj.On("killTask", task.ID).Return(nil)
// exec & post conditions
d := &deleter{
api: obj,
qr: qr,
}
err = d.deleteOne(pod)
assert.Nil(err)
_, found = qr.podQueue.Get("foo0")
assert.False(found)
assert.Equal(0, len(qr.podQueue.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_badPodNaming(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
pod := &Pod{Pod: &api.Pod{}}
d := &deleter{
api: obj,
qr: newQueuer(nil),
}
err := d.deleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = "foo"
err = d.deleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = ""
pod.Pod.ObjectMeta.Namespace = "bar"
err = d.deleteOne(pod)
assert.NotNil(err)
obj.AssertExpectations(t)
}

View File

@@ -0,0 +1,80 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/cache"
)
// wrapper for the k8s pod type so that we can define additional methods on a "pod"
type Pod struct {
*api.Pod
deadline *time.Time
delay *time.Duration
notify queue.BreakChan
}
// implements Copyable
func (p *Pod) Copy() queue.Copyable {
if p == nil {
return nil
}
//TODO(jdef) we may need a better "deep-copy" implementation
pod := *(p.Pod)
return &Pod{Pod: &pod}
}
// implements Unique
func (p *Pod) GetUID() string {
if id, err := cache.MetaNamespaceKeyFunc(p.Pod); err != nil {
panic(fmt.Sprintf("failed to determine pod id for '%+v'", p.Pod))
} else {
return id
}
}
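// Note: for namespaced objects, cache.MetaNamespaceKeyFunc produces keys of the form
// "<namespace>/<name>", e.g. "default/foo" for a pod named "foo" in the default namespace.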
// implements Deadlined
func (dp *Pod) Deadline() (time.Time, bool) {
if dp.deadline != nil {
return *(dp.deadline), true
}
return time.Time{}, false
}
func (dp *Pod) GetDelay() time.Duration {
if dp.delay != nil {
return *(dp.delay)
}
return 0
}
func (p *Pod) Breaker() queue.BreakChan {
return p.notify
}
func (p *Pod) String() string {
displayDeadline := "<none>"
if deadline, ok := p.Deadline(); ok {
displayDeadline = deadline.String()
}
return fmt.Sprintf("{pod:%v, deadline:%v, delay:%v}", p.Pod.Name, displayDeadline, p.GetDelay())
}

View File

@@ -0,0 +1,54 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"fmt"
"io"
"net/http"
log "github.com/golang/glog"
)
//TODO(jdef) we use a Locker to guard against concurrent task state changes, but it would be
//really, really nice to avoid doing this. Maybe someday the registry won't return data ptrs
//but plain structs instead.
func InstallDebugHandlers(reg Registry, mux *http.ServeMux) {
mux.HandleFunc("/debug/registry/tasks", func(w http.ResponseWriter, r *http.Request) {
//TODO(jdef) support filtering tasks based on status
alltasks := reg.List(nil)
io.WriteString(w, fmt.Sprintf("task_count=%d\n", len(alltasks)))
for _, task := range alltasks {
if err := func() (err error) {
podName := task.Pod.Name
podNamespace := task.Pod.Namespace
offerId := ""
if task.Offer != nil {
offerId = task.Offer.Id()
}
_, err = io.WriteString(w, fmt.Sprintf("%v\t%v/%v\t%v\t%v\n", task.ID, podNamespace, podName, task.State, offerId))
return
}(); err != nil {
log.Warningf("aborting debug handler: %v", err)
break // stop listing on I/O errors
}
}
if flusher, ok := w.(http.Flusher); ok {
flusher.Flush()
}
})
}
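// An illustrative dump from the handler above (task and offer IDs here are made up):
//
//	task_count=1
//	pod.5ad6c1d2-...	default/nginx	1	offer-1
//
// where the columns are task ID, namespace/name, numeric state (StatePending=0,
// StateRunning=1, ...) and offer ID.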

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package podtask maps Kubernetes pods to Mesos tasks.
package podtask

View File

@@ -0,0 +1,29 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
// Concepts that have leaked to where they should not have.
import (
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/registry/etcd"
)
// MakePodKey constructs etcd paths to pod items, enforcing namespace rules.
func MakePodKey(ctx api.Context, id string) (string, error) {
return etcd.MakeEtcdItemKey(ctx, PodPath, id)
}

View File

@@ -0,0 +1,373 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"fmt"
"strings"
"time"
"code.google.com/p/go-uuid/uuid"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
annotation "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/gogo/protobuf/proto"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
)
const (
containerCpus = 0.25 // initial CPU allocated for executor
containerMem = 64 // initial MB of memory allocated for executor
)
type StateType int
const (
StatePending StateType = iota
StateRunning
StateFinished
StateUnknown
)
type FlagType string
const (
Launched = FlagType("launched")
Bound = FlagType("bound")
Deleted = FlagType("deleted")
)
// A struct that describes a pod task.
type T struct {
ID string
Pod api.Pod
Spec Spec
Offer offers.Perishable // thread-safe
State StateType
Flags map[FlagType]struct{}
CreateTime time.Time
UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master
podStatus api.PodStatus
executor *mesos.ExecutorInfo // readonly
podKey string
launchTime time.Time
bindTime time.Time
mapper HostPortMappingType
}
type Spec struct {
SlaveID string
CPU float64
Memory float64
PortMap []HostPortMapping
Ports []uint64
Data []byte
}
// mostly-clone this pod task. the clone will actually share some fields:
// - executor // OK because it's read-only
// - Offer    // OK because it guarantees safe concurrent access
func (t *T) Clone() *T {
if t == nil {
return nil
}
// shallow-copy
clone := *t
// deep copy
(&t.Spec).copyTo(&clone.Spec)
clone.Flags = map[FlagType]struct{}{}
for k := range t.Flags {
clone.Flags[k] = struct{}{}
}
return &clone
}
func (old *Spec) copyTo(new *Spec) {
if len(old.PortMap) > 0 {
new.PortMap = append(([]HostPortMapping)(nil), old.PortMap...)
}
if len(old.Ports) > 0 {
new.Ports = append(([]uint64)(nil), old.Ports...)
}
if len(old.Data) > 0 {
new.Data = append(([]byte)(nil), old.Data...)
}
}
func (t *T) HasAcceptedOffer() bool {
return t.Spec.SlaveID != ""
}
func (t *T) GetOfferId() string {
if t.Offer == nil {
return ""
}
return t.Offer.Details().Id.GetValue()
}
func generateTaskName(pod *api.Pod) string {
ns := pod.Namespace
if ns == "" {
ns = api.NamespaceDefault
}
return fmt.Sprintf("%s.%s.pods", pod.Name, ns)
}
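// For example, a pod "foo" in namespace "bar" yields the task name "foo.bar.pods";
// an empty namespace falls back to "default", giving "foo.default.pods".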
func (t *T) BuildTaskInfo() *mesos.TaskInfo {
info := &mesos.TaskInfo{
Name: proto.String(generateTaskName(&t.Pod)),
TaskId: mutil.NewTaskID(t.ID),
SlaveId: mutil.NewSlaveID(t.Spec.SlaveID),
Executor: t.executor,
Data: t.Spec.Data,
Resources: []*mesos.Resource{
mutil.NewScalarResource("cpus", t.Spec.CPU),
mutil.NewScalarResource("mem", t.Spec.Memory),
},
}
if portsResource := rangeResource("ports", t.Spec.Ports); portsResource != nil {
info.Resources = append(info.Resources, portsResource)
}
return info
}
// FillFromDetails fills in the Spec of the T from an offer; it should be called during
// k8s scheduling, before binding.
func (t *T) FillFromDetails(details *mesos.Offer) error {
if details == nil {
//programming error
panic("offer details are nil")
}
log.V(3).Infof("Recording offer(s) %v against pod %v", details.Id, t.Pod.Name)
t.Spec = Spec{
SlaveID: details.GetSlaveId().GetValue(),
CPU: containerCpus,
Memory: containerMem,
}
if mapping, err := t.mapper.Generate(t, details); err != nil {
t.Reset()
return err
} else {
ports := []uint64{}
for _, entry := range mapping {
ports = append(ports, entry.OfferPort)
}
t.Spec.PortMap = mapping
t.Spec.Ports = ports
}
// the hostname of the executor needs to match that of the offer, otherwise
// the kubelet node status checker/updater is very unhappy
const HOSTNAME_OVERRIDE_FLAG = "--hostname-override="
hostname := details.GetHostname() // required field, non-empty
hostnameOverride := HOSTNAME_OVERRIDE_FLAG + hostname
argv := t.executor.Command.Arguments
overwrite := false
for i, arg := range argv {
if strings.HasPrefix(arg, HOSTNAME_OVERRIDE_FLAG) {
overwrite = true
argv[i] = hostnameOverride
break
}
}
if !overwrite {
t.executor.Command.Arguments = append(argv, hostnameOverride)
}
return nil
}
// Clear offer-related details from the task; should be called if/when an offer
// has already been assigned to a task but for some reason is no longer valid.
func (t *T) Reset() {
log.V(3).Infof("Clearing offer(s) from pod %v", t.Pod.Name)
t.Offer = nil
t.Spec = Spec{}
}
func (t *T) AcceptOffer(offer *mesos.Offer) bool {
if offer == nil {
return false
}
var (
cpus float64 = 0
mem float64 = 0
)
for _, resource := range offer.Resources {
if resource.GetName() == "cpus" {
cpus = *resource.GetScalar().Value
}
if resource.GetName() == "mem" {
mem = *resource.GetScalar().Value
}
}
if _, err := t.mapper.Generate(t, offer); err != nil {
log.V(3).Info(err)
return false
}
// for now hard-coded, constant values are used for cpus and mem. This is necessary
// until parent-cgroup integration is finished for mesos and k8sm. Then the k8sm
// executor can become the parent of pods and subsume their resource usage and
// therefore be compliant with expectations of mesos executors w/ respect to
// resource allocation and management.
//
// TODO(jdef): remove hardcoded values and make use of actual pod resource settings
if (cpus < containerCpus) || (mem < containerMem) {
log.V(3).Infof("not enough resources: cpus: %f mem: %f", cpus, mem)
return false
}
return true
}
func (t *T) Set(f FlagType) {
t.Flags[f] = struct{}{}
if Launched == f {
t.launchTime = time.Now()
queueWaitTime := t.launchTime.Sub(t.CreateTime)
metrics.QueueWaitTime.Observe(metrics.InMicroseconds(queueWaitTime))
}
}
func (t *T) Has(f FlagType) (exists bool) {
_, exists = t.Flags[f]
return
}
func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo) (*T, error) {
if executor == nil {
return nil, fmt.Errorf("illegal argument: executor was nil")
}
key, err := MakePodKey(ctx, pod.Name)
if err != nil {
return nil, err
}
if id == "" {
id = "pod." + uuid.NewUUID().String()
}
task := &T{
ID: id,
Pod: pod,
State: StatePending,
podKey: key,
mapper: MappingTypeForPod(&pod),
Flags: make(map[FlagType]struct{}),
executor: proto.Clone(executor).(*mesos.ExecutorInfo),
}
task.CreateTime = time.Now()
return task, nil
}
func (t *T) SaveRecoveryInfo(dict map[string]string) {
dict[annotation.TaskIdKey] = t.ID
dict[annotation.SlaveIdKey] = t.Spec.SlaveID
dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue()
dict[annotation.ExecutorIdKey] = t.executor.ExecutorId.GetValue()
}
// reconstruct a task from metadata stashed in a pod entry. there are limited pod states that
// support reconstruction. if we expect to be able to reconstruct state but encounter errors
// in the process then those errors are returned. if the pod is in a seemingly valid state but
// otherwise does not support task reconstruction, return false. if we're able to reconstruct
// state, then return the reconstructed task and true.
//
// at this time task reconstruction is only supported for pods that have been annotated with
// binding metadata, which implies that they've previously been associated with a task and
// that mesos knows about it.
//
// assumes that the pod data comes from the k8s registry and reflects the desired state.
//
func RecoverFrom(pod api.Pod) (*T, bool, error) {
// we only expect annotations if pod has been bound, which implies that it has already
// been scheduled and launched
if pod.Spec.NodeName == "" && len(pod.Annotations) == 0 {
log.V(1).Infof("skipping recovery for unbound pod %v/%v", pod.Namespace, pod.Name)
return nil, false, nil
}
// only process pods that are not in a terminal state
switch pod.Status.Phase {
case api.PodPending, api.PodRunning, api.PodUnknown: // continue
default:
log.V(1).Infof("skipping recovery for terminal pod %v/%v", pod.Namespace, pod.Name)
return nil, false, nil
}
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
key, err := MakePodKey(ctx, pod.Name)
if err != nil {
return nil, false, err
}
//TODO(jdef) recover ports (and other resource requirements?) from the pod spec as well
now := time.Now()
t := &T{
Pod: pod,
CreateTime: now,
podKey: key,
State: StatePending, // possibly running? mesos will tell us during reconciliation
Flags: make(map[FlagType]struct{}),
mapper: MappingTypeForPod(&pod),
launchTime: now,
bindTime: now,
}
var (
offerId string
hostname string
)
for _, k := range []string{
annotation.BindingHostKey,
annotation.TaskIdKey,
annotation.SlaveIdKey,
annotation.OfferIdKey,
annotation.ExecutorIdKey,
} {
v, found := pod.Annotations[k]
if !found {
return nil, false, fmt.Errorf("incomplete metadata: missing value for pod annotation: %v", k)
}
switch k {
case annotation.BindingHostKey:
hostname = v
case annotation.SlaveIdKey:
t.Spec.SlaveID = v
case annotation.OfferIdKey:
offerId = v
case annotation.TaskIdKey:
t.ID = v
case annotation.ExecutorIdKey:
// this is nowhere near sufficient to re-launch a task, but we really just
// want this for tracking
t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
}
}
t.Offer = offers.Expired(offerId, hostname, 0)
t.Flags[Launched] = struct{}{}
t.Flags[Bound] = struct{}{}
return t, true, nil
}

View File

@@ -0,0 +1,153 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"testing"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
)
const (
t_min_cpu = 128
t_min_mem = 128
)
func fakePodTask(id string) (*T, error) {
return New(api.NewDefaultContext(), "", api.Pod{
ObjectMeta: api.ObjectMeta{
Name: id,
Namespace: api.NamespaceDefault,
},
}, &mesos.ExecutorInfo{})
}
func TestEmptyOffer(t *testing.T) {
t.Parallel()
task, err := fakePodTask("foo")
if err != nil {
t.Fatal(err)
}
if ok := task.AcceptOffer(nil); ok {
t.Fatalf("accepted nil offer")
}
if ok := task.AcceptOffer(&mesos.Offer{}); ok {
t.Fatalf("accepted empty offer")
}
}
func TestNoPortsInPodOrOffer(t *testing.T) {
t.Parallel()
task, err := fakePodTask("foo")
if err != nil || task == nil {
t.Fatal(err)
}
offer := &mesos.Offer{
Resources: []*mesos.Resource{
mutil.NewScalarResource("cpus", 0.001),
mutil.NewScalarResource("mem", 0.001),
},
}
if ok := task.AcceptOffer(offer); ok {
t.Fatalf("accepted offer %v:", offer)
}
offer = &mesos.Offer{
Resources: []*mesos.Resource{
mutil.NewScalarResource("cpus", t_min_cpu),
mutil.NewScalarResource("mem", t_min_mem),
},
}
if ok := task.AcceptOffer(offer); !ok {
t.Fatalf("did not accepted offer %v:", offer)
}
}
func TestAcceptOfferPorts(t *testing.T) {
t.Parallel()
task, _ := fakePodTask("foo")
pod := &task.Pod
offer := &mesos.Offer{
Resources: []*mesos.Resource{
mutil.NewScalarResource("cpus", t_min_cpu),
mutil.NewScalarResource("mem", t_min_mem),
rangeResource("ports", []uint64{1, 1}),
},
}
if ok := task.AcceptOffer(offer); !ok {
t.Fatalf("did not accepted offer %v:", offer)
}
pod.Spec = api.PodSpec{
Containers: []api.Container{{
Ports: []api.ContainerPort{{
HostPort: 123,
}},
}},
}
if ok := task.AcceptOffer(offer); ok {
t.Fatalf("accepted offer %v:", offer)
}
pod.Spec.Containers[0].Ports[0].HostPort = 1
if ok := task.AcceptOffer(offer); !ok {
t.Fatalf("did not accepted offer %v:", offer)
}
pod.Spec.Containers[0].Ports[0].HostPort = 0
if ok := task.AcceptOffer(offer); !ok {
t.Fatalf("did not accepted offer %v:", offer)
}
offer.Resources = []*mesos.Resource{
mutil.NewScalarResource("cpus", t_min_cpu),
mutil.NewScalarResource("mem", t_min_mem),
}
if ok := task.AcceptOffer(offer); ok {
t.Fatalf("accepted offer %v:", offer)
}
pod.Spec.Containers[0].Ports[0].HostPort = 1
if ok := task.AcceptOffer(offer); ok {
t.Fatalf("accepted offer %v:", offer)
}
}
func TestGeneratePodName(t *testing.T) {
p := &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
Namespace: "bar",
},
}
name := generateTaskName(p)
expected := "foo.bar.pods"
if name != expected {
t.Fatalf("expected %q instead of %q", expected, name)
}
p.Namespace = ""
name = generateTaskName(p)
expected = "foo.default.pods"
if name != expected {
t.Fatalf("expected %q instead of %q", expected, name)
}
}

View File

@@ -0,0 +1,185 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"fmt"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
)
type HostPortMappingType string
const (
// maps a Container.HostPort to the same exact offered host port, ignores .HostPort = 0
HostPortMappingFixed HostPortMappingType = "fixed"
// same as HostPortMappingFixed, except that a .HostPort of 0 is mapped to any offered port
HostPortMappingWildcard HostPortMappingType = "wildcard"
)
type HostPortMapper interface {
// abstracts the way that host ports are mapped to pod container ports
Generate(t *T, offer *mesos.Offer) ([]HostPortMapping, error)
}
type HostPortMapping struct {
ContainerIdx int // index of the container in the pod spec
PortIdx int // index of the port in a container's port spec
OfferPort uint64
}
func (self HostPortMappingType) Generate(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
switch self {
case HostPortMappingWildcard:
return wildcardHostPortMapping(t, offer)
case HostPortMappingFixed:
default:
log.Warningf("illegal host-port mapping spec %q, defaulting to %q", self, HostPortMappingFixed)
}
return defaultHostPortMapping(t, offer)
}
type PortAllocationError struct {
PodId string
Ports []uint64
}
func (err *PortAllocationError) Error() string {
return fmt.Sprintf("Could not schedule pod %s: %d port(s) could not be allocated", err.PodId, len(err.Ports))
}
type DuplicateHostPortError struct {
m1, m2 HostPortMapping
}
func (err *DuplicateHostPortError) Error() string {
return fmt.Sprintf(
"Host port %d is specified for container %d, pod %d and container %d, pod %d",
err.m1.OfferPort, err.m1.ContainerIdx, err.m1.PortIdx, err.m2.ContainerIdx, err.m2.PortIdx)
}
// wildcard k8s host port mapping implementation: hostPort == 0 gets mapped to any available offer port
func wildcardHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
mapping, err := defaultHostPortMapping(t, offer)
if err != nil {
return nil, err
}
taken := make(map[uint64]struct{})
for _, entry := range mapping {
taken[entry.OfferPort] = struct{}{}
}
wildports := []HostPortMapping{}
for i, container := range t.Pod.Spec.Containers {
for pi, port := range container.Ports {
if port.HostPort == 0 {
wildports = append(wildports, HostPortMapping{
ContainerIdx: i,
PortIdx: pi,
})
}
}
}
remaining := len(wildports)
foreachRange(offer, "ports", func(bp, ep uint64) {
log.V(3).Infof("Searching for wildcard port in range {%d:%d}", bp, ep)
for i := range wildports {
// iterate by index so that assignments to OfferPort stick; ranging by value would
// leave wildports[i].OfferPort at 0 and could double-allocate across multiple port ranges
if wildports[i].OfferPort != 0 {
continue
}
for port := bp; port <= ep && remaining > 0; port++ {
if _, inuse := taken[port]; inuse {
continue
}
wildports[i].OfferPort = port
mapping = append(mapping, wildports[i])
remaining--
taken[port] = struct{}{}
break
}
}
})
if remaining > 0 {
err := &PortAllocationError{
PodId: t.Pod.Name,
}
// it doesn't make sense to include a port list here because they were all zero (wildcards)
return nil, err
}
return mapping, nil
}
// default k8s host port mapping implementation: hostPort == 0 means containerPort remains pod-private, and so
// no offer ports will be mapped to such Container ports.
func defaultHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
requiredPorts := make(map[uint64]HostPortMapping)
mapping := []HostPortMapping{}
for i, container := range t.Pod.Spec.Containers {
// skip all ports with HostPort == 0; k8s already knows what to do with zero
// ports (it does not create 'port bindings' on the minion host), and such
// wildcard ports don't consume host resources
for pi, port := range container.Ports {
if port.HostPort == 0 {
continue // ignore
}
m := HostPortMapping{
ContainerIdx: i,
PortIdx: pi,
OfferPort: uint64(port.HostPort),
}
if entry, inuse := requiredPorts[uint64(port.HostPort)]; inuse {
return nil, &DuplicateHostPortError{entry, m}
}
requiredPorts[uint64(port.HostPort)] = m
}
}
foreachRange(offer, "ports", func(bp, ep uint64) {
for port := range requiredPorts {
log.V(3).Infof("evaluating port range {%d:%d} %d", bp, ep, port)
if (bp <= port) && (port <= ep) {
mapping = append(mapping, requiredPorts[port])
delete(requiredPorts, port)
}
}
})
unsatisfiedPorts := len(requiredPorts)
if unsatisfiedPorts > 0 {
err := &PortAllocationError{
PodId: t.Pod.Name,
}
for p := range requiredPorts {
err.Ports = append(err.Ports, p)
}
return nil, err
}
return mapping, nil
}
const PortMappingLabelKey = "k8s.mesosphere.io/portMapping"
func MappingTypeForPod(pod *api.Pod) HostPortMappingType {
filter := map[string]string{
PortMappingLabelKey: string(HostPortMappingFixed),
}
selector := labels.Set(filter).AsSelector()
if selector.Matches(labels.Set(pod.Labels)) {
return HostPortMappingFixed
}
return HostPortMappingWildcard
}
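// In other words, pods default to wildcard mapping; only pods labeled with
// k8s.mesosphere.io/portMapping=fixed get the fixed mapping.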

View File

@@ -0,0 +1,205 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"testing"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
mesos "github.com/mesos/mesos-go/mesosproto"
)
func TestDefaultHostPortMatching(t *testing.T) {
t.Parallel()
task, _ := fakePodTask("foo")
pod := &task.Pod
offer := &mesos.Offer{
Resources: []*mesos.Resource{
rangeResource("ports", []uint64{1, 1}),
},
}
mapping, err := defaultHostPortMapping(task, offer)
if err != nil {
t.Fatal(err)
}
if len(mapping) > 0 {
t.Fatalf("Found mappings for a pod without ports: %v", pod)
}
//--
pod.Spec = api.PodSpec{
Containers: []api.Container{{
Ports: []api.ContainerPort{{
HostPort: 123,
}, {
HostPort: 123,
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
if err != nil {
t.Fatal(err)
}
_, err = defaultHostPortMapping(task, offer)
if err, _ := err.(*DuplicateHostPortError); err == nil {
t.Fatal("Expected duplicate port error")
} else if err.m1.OfferPort != 123 {
t.Fatal("Expected duplicate host port 123")
}
}
func TestWildcardHostPortMatching(t *testing.T) {
t.Parallel()
task, _ := fakePodTask("foo")
pod := &task.Pod
offer := &mesos.Offer{}
mapping, err := wildcardHostPortMapping(task, offer)
if err != nil {
t.Fatal(err)
}
if len(mapping) > 0 {
t.Fatalf("Found mappings for an empty offer and a pod without ports: %v", pod)
}
//--
offer = &mesos.Offer{
Resources: []*mesos.Resource{
rangeResource("ports", []uint64{1, 1}),
},
}
mapping, err = wildcardHostPortMapping(task, offer)
if err != nil {
t.Fatal(err)
}
if len(mapping) > 0 {
t.Fatalf("Found mappings for a pod without ports: %v", pod)
}
//--
pod.Spec = api.PodSpec{
Containers: []api.Container{{
Ports: []api.ContainerPort{{
HostPort: 123,
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
if err != nil {
t.Fatal(err)
}
mapping, err = wildcardHostPortMapping(task, offer)
if err == nil {
t.Fatalf("expected error instead of mappings: %#v", mapping)
} else if err, _ := err.(*PortAllocationError); err == nil {
t.Fatal("Expected port allocation error")
} else if !(len(err.Ports) == 1 && err.Ports[0] == 123) {
t.Fatal("Expected port allocation error for host port 123")
}
//--
pod.Spec = api.PodSpec{
Containers: []api.Container{{
Ports: []api.ContainerPort{{
HostPort: 0,
}, {
HostPort: 123,
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
if err != nil {
t.Fatal(err)
}
mapping, err = wildcardHostPortMapping(task, offer)
if err, _ := err.(*PortAllocationError); err == nil {
t.Fatal("Expected port allocation error")
} else if !(len(err.Ports) == 1 && err.Ports[0] == 123) {
t.Fatal("Expected port allocation error for host port 123")
}
//--
pod.Spec = api.PodSpec{
Containers: []api.Container{{
Ports: []api.ContainerPort{{
HostPort: 0,
}, {
HostPort: 1,
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
if err != nil {
t.Fatal(err)
}
mapping, err = wildcardHostPortMapping(task, offer)
if err, _ := err.(*PortAllocationError); err == nil {
t.Fatal("Expected port allocation error")
} else if len(err.Ports) != 0 {
t.Fatal("Expected port allocation error for wildcard port")
}
//--
offer = &mesos.Offer{
Resources: []*mesos.Resource{
rangeResource("ports", []uint64{1, 2}),
},
}
mapping, err = wildcardHostPortMapping(task, offer)
if err != nil {
t.Fatal(err)
} else if len(mapping) != 2 {
t.Fatal("Expected both ports allocated")
}
valid := 0
for _, entry := range mapping {
if entry.ContainerIdx == 0 && entry.PortIdx == 0 && entry.OfferPort == 2 {
valid++
}
if entry.ContainerIdx == 0 && entry.PortIdx == 1 && entry.OfferPort == 1 {
valid++
}
}
if valid < 2 {
t.Fatalf("Expected 2 valid port mappings, not %d", valid)
}
}
func TestMappingTypeForPod(t *testing.T) {
pod := &api.Pod{
ObjectMeta: api.ObjectMeta{
Labels: map[string]string{},
},
}
mt := MappingTypeForPod(pod)
if mt != HostPortMappingWildcard {
t.Fatalf("expected wildcard mapping")
}
pod.Labels[PortMappingLabelKey] = string(HostPortMappingFixed)
mt = MappingTypeForPod(pod)
if mt != HostPortMappingFixed {
t.Fatalf("expected fixed mapping")
}
pod.Labels[PortMappingLabelKey] = string(HostPortMappingWildcard)
mt = MappingTypeForPod(pod)
if mt != HostPortMappingWildcard {
t.Fatalf("expected wildcard mapping")
}
}

View File

@@ -0,0 +1,57 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"github.com/gogo/protobuf/proto"
mesos "github.com/mesos/mesos-go/mesosproto"
)
// create a range resource for the listed ports
func rangeResource(name string, ports []uint64) *mesos.Resource {
if len(ports) == 0 {
// pod may consist of a container that doesn't expose any ports on the host
return nil
}
return &mesos.Resource{
Name: proto.String(name),
Type: mesos.Value_RANGES.Enum(),
Ranges: newRanges(ports),
}
}
// generate port ranges from a list of ports. this implementation is very naive
func newRanges(ports []uint64) *mesos.Value_Ranges {
r := make([]*mesos.Value_Range, 0)
for _, port := range ports {
x := proto.Uint64(port)
r = append(r, &mesos.Value_Range{Begin: x, End: x})
}
return &mesos.Value_Ranges{Range: r}
}
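// For example, newRanges([]uint64{8000, 8001}) produces two single-port ranges,
// [8000-8000] and [8001-8001], rather than one merged [8000-8001] range.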
func foreachRange(offer *mesos.Offer, resourceName string, f func(begin, end uint64)) {
for _, resource := range offer.Resources {
if resource.GetName() == resourceName {
for _, r := range (*resource).GetRanges().Range {
bp := r.GetBegin()
ep := r.GetEnd()
f(bp, ep)
}
}
}
}

View File

@@ -0,0 +1,335 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"container/ring"
"encoding/json"
"fmt"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
)
const (
//TODO(jdef) move this somewhere else
PodPath = "/pods"
// length of historical record of finished tasks
defaultFinishedTasksSize = 1024
)
// state store for pod tasks
type Registry interface {
// register the specified task with this registry, as long as the current error
// condition is nil. if no errors occur then return a copy of the registered task.
Register(*T, error) (*T, error)
// unregister the specified task from this registry
Unregister(*T)
// update state for the registered task identified by task.ID; returns an error if no
// such task exists or if the update is not allowed in the task's current state.
Update(task *T) error
// return the task registered for the specified task ID and its current state.
// if there is no such task then StateUnknown is returned.
Get(taskId string) (task *T, currentState StateType)
// return the non-terminal task corresponding to the specified pod ID
ForPod(podID string) (task *T, currentState StateType)
// update the task status given the specified mesos task status update, returning a
// copy of the updated task (if any) and its state.
UpdateStatus(status *mesos.TaskStatus) (*T, StateType)
// return a list of tasks that match the given filter, or all tasks if filter == nil.
List(filter func(*T) bool) []*T
}
type inMemoryRegistry struct {
rw sync.RWMutex
taskRegistry map[string]*T
tasksFinished *ring.Ring
podToTask map[string]string
}
func NewInMemoryRegistry() Registry {
return &inMemoryRegistry{
taskRegistry: make(map[string]*T),
tasksFinished: ring.New(defaultFinishedTasksSize),
podToTask: make(map[string]string),
}
}
func (k *inMemoryRegistry) List(accepts func(t *T) bool) (tasks []*T) {
k.rw.RLock()
defer k.rw.RUnlock()
for _, task := range k.taskRegistry {
if accepts == nil || accepts(task) {
tasks = append(tasks, task.Clone())
}
}
return
}
func (k *inMemoryRegistry) ForPod(podID string) (task *T, currentState StateType) {
k.rw.RLock()
defer k.rw.RUnlock()
tid, ok := k.podToTask[podID]
if !ok {
return nil, StateUnknown
}
t, state := k._get(tid)
return t.Clone(), state
}
// registers a pod task, unless the given error is non-nil, in which case the error is passed through
func (k *inMemoryRegistry) Register(task *T, err error) (*T, error) {
if err == nil {
k.rw.Lock()
defer k.rw.Unlock()
if _, found := k.podToTask[task.podKey]; found {
return nil, fmt.Errorf("task already registered for pod key %q", task.podKey)
}
if _, found := k.taskRegistry[task.ID]; found {
return nil, fmt.Errorf("task already registered for id %q", task.ID)
}
k.podToTask[task.podKey] = task.ID
k.taskRegistry[task.ID] = task
}
return task.Clone(), err
}
// updates internal task state. updates are limited to Spec, Flags, and Offer for
// StatePending tasks, and are limited to Flag updates (additive only) for StateRunning tasks.
func (k *inMemoryRegistry) Update(task *T) error {
if task == nil {
return nil
}
k.rw.Lock()
defer k.rw.Unlock()
switch internal, state := k._get(task.ID); state {
case StateUnknown:
return fmt.Errorf("no such task: %v", task.ID)
case StatePending:
internal.Offer = task.Offer
internal.Spec = task.Spec
(&task.Spec).copyTo(&internal.Spec)
internal.Flags = map[FlagType]struct{}{}
fallthrough
case StateRunning:
for k, v := range task.Flags {
internal.Flags[k] = v
}
return nil
default:
return fmt.Errorf("may not update task %v in state %v", task.ID, state)
}
}
func (k *inMemoryRegistry) Unregister(task *T) {
k.rw.Lock()
defer k.rw.Unlock()
delete(k.podToTask, task.podKey)
delete(k.taskRegistry, task.ID)
}
func (k *inMemoryRegistry) Get(taskId string) (*T, StateType) {
k.rw.RLock()
defer k.rw.RUnlock()
t, state := k._get(taskId)
return t.Clone(), state
}
// assume that the caller has already locked around access to task state.
// the caller is also responsible for cloning the task object before it leaves
// the context of this registry.
func (k *inMemoryRegistry) _get(taskId string) (*T, StateType) {
if task, found := k.taskRegistry[taskId]; found {
return task, task.State
}
return nil, StateUnknown
}
func (k *inMemoryRegistry) UpdateStatus(status *mesos.TaskStatus) (*T, StateType) {
taskId := status.GetTaskId().GetValue()
k.rw.Lock()
defer k.rw.Unlock()
task, state := k._get(taskId)
switch status.GetState() {
case mesos.TaskState_TASK_STAGING:
k.handleTaskStaging(task, state, status)
case mesos.TaskState_TASK_STARTING:
k.handleTaskStarting(task, state, status)
case mesos.TaskState_TASK_RUNNING:
k.handleTaskRunning(task, state, status)
case mesos.TaskState_TASK_FINISHED:
k.handleTaskFinished(task, state, status)
case mesos.TaskState_TASK_FAILED:
k.handleTaskFailed(task, state, status)
case mesos.TaskState_TASK_KILLED:
k.handleTaskKilled(task, state, status)
case mesos.TaskState_TASK_LOST:
k.handleTaskLost(task, state, status)
default:
log.Warningf("unhandled status update for task: %v", taskId)
}
return task.Clone(), state
}
func (k *inMemoryRegistry) handleTaskStaging(task *T, state StateType, status *mesos.TaskStatus) {
if status.GetSource() != mesos.TaskStatus_SOURCE_MASTER {
log.Errorf("received STAGING for task %v with unexpected source: %v",
status.GetTaskId().GetValue(), status.GetSource())
}
}
func (k *inMemoryRegistry) handleTaskStarting(task *T, state StateType, status *mesos.TaskStatus) {
// we expect to receive this when a launched task is finally "bound" via the API
// server; for pending tasks, record the bind time and report the bind latency metric.
switch state {
case StatePending:
task.UpdatedTime = time.Now()
if !task.Has(Bound) {
task.Set(Bound)
task.bindTime = task.UpdatedTime
timeToBind := task.bindTime.Sub(task.launchTime)
metrics.BindLatency.Observe(metrics.InMicroseconds(timeToBind))
}
default:
taskId := status.GetTaskId().GetValue()
log.Warningf("Ignore status TASK_STARTING because the task %v is not pending", taskId)
}
}
func (k *inMemoryRegistry) handleTaskRunning(task *T, state StateType, status *mesos.TaskStatus) {
taskId := status.GetTaskId().GetValue()
switch state {
case StatePending:
task.UpdatedTime = time.Now()
log.Infof("Received running status for pending task: %v", taskId)
fillRunningPodInfo(task, status)
task.State = StateRunning
case StateRunning:
task.UpdatedTime = time.Now()
log.V(2).Infof("Ignore status TASK_RUNNING because the task %v is already running", taskId)
case StateFinished:
log.Warningf("Ignore status TASK_RUNNING because the task %v is already finished", taskId)
default:
log.Warningf("Ignore status TASK_RUNNING because the task %v is discarded", taskId)
}
}
func ParsePodStatusResult(taskStatus *mesos.TaskStatus) (result api.PodStatusResult, err error) {
if taskStatus.Data != nil {
err = json.Unmarshal(taskStatus.Data, &result)
} else {
err = fmt.Errorf("missing TaskStatus.Data")
}
return
}
func fillRunningPodInfo(task *T, taskStatus *mesos.TaskStatus) {
if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
// there is no data..
return
}
//TODO(jdef) determine the usefulness of this information (if any)
if result, err := ParsePodStatusResult(taskStatus); err != nil {
log.Errorf("invalid TaskStatus.Data for task '%v': %v", task.ID, err)
} else {
task.podStatus = result.Status
log.Infof("received pod status for task %v: %+v", task.ID, result.Status)
}
}
func (k *inMemoryRegistry) handleTaskFinished(task *T, state StateType, status *mesos.TaskStatus) {
taskId := status.GetTaskId().GetValue()
switch state {
case StatePending:
panic(fmt.Sprintf("Pending task %v finished, this couldn't happen", taskId))
case StateRunning:
log.V(2).Infof("received finished status for running task: %v", taskId)
delete(k.podToTask, task.podKey)
task.State = StateFinished
task.UpdatedTime = time.Now()
k.tasksFinished = k.recordFinishedTask(task.ID)
case StateFinished:
log.Warningf("Ignore status TASK_FINISHED because the task %v is already finished", taskId)
default:
log.Warningf("Ignore status TASK_FINISHED because the task %v is not running", taskId)
}
}
// record that a task has finished.
// older records are expunged one at a time once the historical ring buffer is saturated.
// assumes caller is holding state lock.
func (k *inMemoryRegistry) recordFinishedTask(taskId string) *ring.Ring {
slot := k.tasksFinished.Next()
if slot.Value != nil {
// garbage collect older finished task from the registry
gctaskId := slot.Value.(string)
if gctask, found := k.taskRegistry[gctaskId]; found && gctask.State == StateFinished {
delete(k.taskRegistry, gctaskId)
}
}
slot.Value = taskId
return slot
}
func (k *inMemoryRegistry) handleTaskFailed(task *T, state StateType, status *mesos.TaskStatus) {
switch state {
case StatePending:
delete(k.taskRegistry, task.ID)
delete(k.podToTask, task.podKey)
case StateRunning:
delete(k.taskRegistry, task.ID)
delete(k.podToTask, task.podKey)
}
}
func (k *inMemoryRegistry) handleTaskKilled(task *T, state StateType, status *mesos.TaskStatus) {
defer func() {
msg := fmt.Sprintf("task killed: %+v, task %+v", status, task)
if task != nil && task.Has(Deleted) {
// we were expecting this, nothing out of the ordinary
log.V(2).Infoln(msg)
} else {
log.Errorln(msg)
}
}()
switch state {
case StatePending, StateRunning:
delete(k.taskRegistry, task.ID)
delete(k.podToTask, task.podKey)
}
}
func (k *inMemoryRegistry) handleTaskLost(task *T, state StateType, status *mesos.TaskStatus) {
switch state {
case StateRunning, StatePending:
delete(k.taskRegistry, task.ID)
delete(k.podToTask, task.podKey)
}
}
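// To summarize the terminal-state handling above: TASK_FAILED, TASK_KILLED and TASK_LOST
// drop the task from both taskRegistry and podToTask, while TASK_FINISHED keeps the task
// in taskRegistry (subject to the finished-task ring buffer GC) and only removes the
// pod-to-task mapping.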

View File

@@ -0,0 +1,320 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"testing"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/mesos/mesos-go/mesosutil"
"github.com/stretchr/testify/assert"
)
func TestInMemoryRegistry_RegisterGetUnregister(t *testing.T) {
assert := assert.New(t)
registry := NewInMemoryRegistry()
// it's empty at the beginning
tasks := registry.List(func(t *T) bool { return true })
assert.Empty(tasks)
// add a task
a, _ := fakePodTask("a")
a_clone, err := registry.Register(a, nil)
assert.NoError(err)
assert.Equal(a_clone.ID, a.ID)
assert.Equal(a_clone.podKey, a.podKey)
// add another task
b, _ := fakePodTask("b")
b_clone, err := registry.Register(b, nil)
assert.NoError(err)
assert.Equal(b_clone.ID, b.ID)
assert.Equal(b_clone.podKey, b.podKey)
// find tasks in the registry
tasks = registry.List(func(t *T) bool { return true })
assert.Len(tasks, 2)
assert.Contains(tasks, a_clone)
assert.Contains(tasks, b_clone)
tasks = registry.List(func(t *T) bool { return t.ID == a.ID })
assert.Len(tasks, 1)
assert.Contains(tasks, a_clone)
task, _ := registry.ForPod(a.podKey)
assert.NotNil(task)
assert.Equal(task.ID, a.ID)
task, _ = registry.ForPod(b.podKey)
assert.NotNil(task)
assert.Equal(task.ID, b.ID)
task, _ = registry.ForPod("no-pod-key")
assert.Nil(task)
task, _ = registry.Get(a.ID)
assert.NotNil(task)
assert.Equal(task.ID, a.ID)
task, _ = registry.Get("unknown-task-id")
assert.Nil(task)
// re-add a task
a_clone, err = registry.Register(a, nil)
assert.Error(err)
assert.Nil(a_clone)
// re-add a task with another podKey, but same task id
another_a := a.Clone()
another_a.podKey = "another-pod"
another_a_clone, err := registry.Register(another_a, nil)
assert.Error(err)
assert.Nil(another_a_clone)
// re-add a task with another task ID, but same podKey
another_b := b.Clone()
another_b.ID = "another-task-id"
another_b_clone, err := registry.Register(another_b, nil)
assert.Error(err)
assert.Nil(another_b_clone)
// unregister a task
registry.Unregister(b)
tasks = registry.List(func(t *T) bool { return true })
assert.Len(tasks, 1)
assert.Contains(tasks, a)
// unregister a task not registered
unregistered_task, _ := fakePodTask("unregistered-task")
registry.Unregister(unregistered_task)
}
func fakeStatusUpdate(taskId string, state mesos.TaskState) *mesos.TaskStatus {
status := mesosutil.NewTaskStatus(mesosutil.NewTaskID(taskId), state)
status.Data = []byte("{}") // empty json
masterSource := mesos.TaskStatus_SOURCE_MASTER
status.Source = &masterSource
return status
}
func TestInMemoryRegistry_State(t *testing.T) {
assert := assert.New(t)
registry := NewInMemoryRegistry()
// add a task
a, _ := fakePodTask("a")
a_clone, err := registry.Register(a, nil)
assert.NoError(err)
assert.Equal(a.State, a_clone.State)
// update the status
assert.Equal(a_clone.State, StatePending)
a_clone, state := registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_RUNNING))
assert.Equal(state, StatePending) // old state
assert.Equal(a_clone.State, StateRunning) // new state
// update unknown task
unknown_clone, state := registry.UpdateStatus(fakeStatusUpdate("unknown-task-id", mesos.TaskState_TASK_RUNNING))
assert.Nil(unknown_clone)
assert.Equal(state, StateUnknown)
}
func TestInMemoryRegistry_Update(t *testing.T) {
assert := assert.New(t)
// create offers registry
ttl := time.Second / 4
config := offers.RegistryConfig{
DeclineOffer: func(offerId string) <-chan error {
return proc.ErrorChan(nil)
},
Compat: func(o *mesos.Offer) bool {
return true
},
TTL: ttl,
LingerTTL: 2 * ttl,
}
storage := offers.CreateRegistry(config)
// Add offer
offerId := mesosutil.NewOfferID("foo")
mesosOffer := &mesos.Offer{Id: offerId}
storage.Add([]*mesos.Offer{mesosOffer})
offer, ok := storage.Get(offerId.GetValue())
assert.True(ok)
// create registry
registry := NewInMemoryRegistry()
a, _ := fakePodTask("a")
registry.Register(a.Clone(), nil) // here clone a because we change it below
// state changes are ignored
a.State = StateRunning
err := registry.Update(a)
assert.NoError(err)
a_clone, _ := registry.Get(a.ID)
assert.Equal(StatePending, a_clone.State)
// offer is updated while pending
a.Offer = offer
err = registry.Update(a)
assert.NoError(err)
a_clone, _ = registry.Get(a.ID)
assert.Equal(offer.Id(), a_clone.Offer.Id())
// spec is updated while pending
a.Spec = Spec{SlaveID: "slave-1"}
err = registry.Update(a)
assert.NoError(err)
a_clone, _ = registry.Get(a.ID)
assert.Equal("slave-1", a_clone.Spec.SlaveID)
// flags are updated while pending
a.Flags[Launched] = struct{}{}
err = registry.Update(a)
assert.NoError(err)
a_clone, _ = registry.Get(a.ID)
_, found_launched := a_clone.Flags[Launched]
assert.True(found_launched)
// flags are updated while running
registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_RUNNING))
a.Flags[Bound] = struct{}{}
err = registry.Update(a)
assert.NoError(err)
a_clone, _ = registry.Get(a.ID)
_, found_launched = a_clone.Flags[Launched]
assert.True(found_launched)
_, found_bound := a_clone.Flags[Bound]
assert.True(found_bound)
// spec is ignored while running
a.Spec = Spec{SlaveID: "slave-2"}
err = registry.Update(a)
assert.NoError(err)
a_clone, _ = registry.Get(a.ID)
assert.Equal("slave-1", a_clone.Spec.SlaveID)
// error when finished
registry.UpdateStatus(fakeStatusUpdate(a.ID, mesos.TaskState_TASK_FINISHED))
err = registry.Update(a)
assert.Error(err)
// update unknown task
unknown_task, _ := fakePodTask("unknown-task")
err = registry.Update(unknown_task)
assert.Error(err)
// update nil task
err = registry.Update(nil)
assert.Nil(err)
}
type transition struct {
statusUpdate mesos.TaskState
expectedState *StateType
expectPanic bool
}
func NewTransition(statusUpdate mesos.TaskState, expectedState StateType) transition {
return transition{statusUpdate: statusUpdate, expectedState: &expectedState, expectPanic: false}
}
func NewTransitionToDeletedTask(statusUpdate mesos.TaskState) transition {
return transition{statusUpdate: statusUpdate, expectedState: nil, expectPanic: false}
}
func NewTransitionWhichPanics(statusUpdate mesos.TaskState) transition {
return transition{statusUpdate: statusUpdate, expectPanic: true}
}
func testStateTrace(t *testing.T, transitions []transition) *Registry {
assert := assert.New(t)
registry := NewInMemoryRegistry()
a, _ := fakePodTask("a")
a, _ = registry.Register(a, nil)
// initial pending state
assert.Equal(a.State, StatePending)
for _, transition := range transitions {
if transition.expectPanic {
assert.Panics(func() {
registry.UpdateStatus(fakeStatusUpdate(a.ID, transition.statusUpdate))
})
} else {
a, _ = registry.UpdateStatus(fakeStatusUpdate(a.ID, transition.statusUpdate))
if transition.expectedState == nil {
a, _ = registry.Get(a.ID)
assert.Nil(a, "expected task to be deleted from registry after status update to %v", transition.statusUpdate)
} else {
assert.Equal(a.State, *transition.expectedState)
}
}
}
return &registry
}
func TestInMemoryRegistry_TaskLifeCycle(t *testing.T) {
testStateTrace(t, []transition{
NewTransition(mesos.TaskState_TASK_STAGING, StatePending),
NewTransition(mesos.TaskState_TASK_STARTING, StatePending),
NewTransitionWhichPanics(mesos.TaskState_TASK_FINISHED),
NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning),
NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning),
NewTransition(mesos.TaskState_TASK_STARTING, StateRunning),
NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished),
NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished),
NewTransition(mesos.TaskState_TASK_RUNNING, StateFinished),
})
}
func TestInMemoryRegistry_NotFinished(t *testing.T) {
// all these behave the same
notFinishedStates := []mesos.TaskState{
mesos.TaskState_TASK_FAILED,
mesos.TaskState_TASK_KILLED,
mesos.TaskState_TASK_LOST,
}
for _, notFinishedState := range notFinishedStates {
testStateTrace(t, []transition{
NewTransitionToDeletedTask(notFinishedState),
})
testStateTrace(t, []transition{
NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning),
NewTransitionToDeletedTask(notFinishedState),
})
testStateTrace(t, []transition{
NewTransition(mesos.TaskState_TASK_RUNNING, StateRunning),
NewTransition(mesos.TaskState_TASK_FINISHED, StateFinished),
NewTransition(notFinishedState, StateFinished),
})
}
}

View File

@@ -0,0 +1,924 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"io"
"math"
"net/http"
"reflect"
"sync"
"time"
execcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/messages"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
offerMetrics "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers/metrics"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/uid"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
"github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler"
)
type Slave struct {
HostName string
}
func newSlave(hostName string) *Slave {
return &Slave{
HostName: hostName,
}
}
type slaveStorage struct {
sync.Mutex
slaves map[string]*Slave // SlaveID => slave.
}
func newSlaveStorage() *slaveStorage {
return &slaveStorage{
slaves: make(map[string]*Slave),
}
}
// Create a mapping between a slaveId and slave if one does not already exist.
func (self *slaveStorage) checkAndAdd(slaveId, slaveHostname string) {
self.Lock()
defer self.Unlock()
_, exists := self.slaves[slaveId]
if !exists {
self.slaves[slaveId] = newSlave(slaveHostname)
}
}
func (self *slaveStorage) getSlaveIds() []string {
self.Lock()
defer self.Unlock()
slaveIds := make([]string, 0, len(self.slaves))
for slaveID := range self.slaves {
slaveIds = append(slaveIds, slaveID)
}
return slaveIds
}
func (self *slaveStorage) getSlave(slaveId string) (*Slave, bool) {
self.Lock()
defer self.Unlock()
slave, exists := self.slaves[slaveId]
return slave, exists
}
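// Illustrative usage of slaveStorage (not part of the original change; the slave ID
// and hostname below are made up): all access goes through the embedded mutex, so
// callers never touch the slaves map directly.
//
//    store := newSlaveStorage()
//    store.checkAndAdd("20150610-0001-S0", "node-1.example.com")
//    if slave, ok := store.getSlave("20150610-0001-S0"); ok {
//        log.Infoln(slave.HostName) // "node-1.example.com"
//    }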
type PluginInterface interface {
// the apiserver may have a different state for the pod than we do
// so reconcile our records, but only for this one pod
reconcilePod(api.Pod)
// execute the scheduling plugin; should start a goroutine and return immediately
Run(<-chan struct{})
}
// KubernetesScheduler implements:
// 1: A mesos scheduler.
// 2: A kubernetes scheduler plugin.
// 3: A kubernetes pod.Registry.
type KubernetesScheduler struct {
// We use a lock here to avoid races
// between invoking the mesos callback
// and invoking the pod registry interfaces.
// In particular, changes to podtask.T objects are currently guarded by this lock.
*sync.RWMutex
// Config related, write-once
schedcfg *schedcfg.Config
executor *mesos.ExecutorInfo
executorGroup uint64
scheduleFunc PodScheduleFunc
client *client.Client
etcdClient tools.EtcdGetSet
failoverTimeout float64 // in seconds
reconcileInterval int64
// Mesos context.
driver bindings.SchedulerDriver // late initialization
frameworkId *mesos.FrameworkID
masterInfo *mesos.MasterInfo
registered bool
registration chan struct{} // signal chan that closes upon first successful registration
onRegistration sync.Once
offers offers.Registry
slaves *slaveStorage
// unsafe state, needs to be guarded
taskRegistry podtask.Registry
// via deferred init
plugin PluginInterface
reconciler *Reconciler
reconcileCooldown time.Duration
asRegisteredMaster proc.Doer
terminate <-chan struct{} // signal chan, closes when we should kill background tasks
}
type Config struct {
Schedcfg schedcfg.Config
Executor *mesos.ExecutorInfo
ScheduleFunc PodScheduleFunc
Client *client.Client
EtcdClient tools.EtcdGetSet
FailoverTimeout float64
ReconcileInterval int64
ReconcileCooldown time.Duration
}
// New creates a new KubernetesScheduler
func New(config Config) *KubernetesScheduler {
var k *KubernetesScheduler
k = &KubernetesScheduler{
schedcfg: &config.Schedcfg,
RWMutex: new(sync.RWMutex),
executor: config.Executor,
executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
scheduleFunc: config.ScheduleFunc,
client: config.Client,
etcdClient: config.EtcdClient,
failoverTimeout: config.FailoverTimeout,
reconcileInterval: config.ReconcileInterval,
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
// filter the offers: the executor IDs must not identify a kubelet-
// executor with a group that doesn't match ours
for _, eid := range o.GetExecutorIds() {
execuid := uid.Parse(eid.GetValue())
if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
return false
}
}
return true
},
DeclineOffer: func(id string) <-chan error {
errOnce := proc.NewErrorOnce(k.terminate)
errOuter := k.asRegisteredMaster.Do(func() {
var err error
defer errOnce.Report(err)
offerId := mutil.NewOfferID(id)
filters := &mesos.Filters{}
_, err = k.driver.DeclineOffer(offerId, filters)
})
return errOnce.Send(errOuter).Err()
},
// remember expired offers so that we can tell if a previously scheduled pod still relies on one
LingerTTL: config.Schedcfg.OfferLingerTTL.Duration,
TTL: config.Schedcfg.OfferTTL.Duration,
ListenerDelay: config.Schedcfg.ListenerDelay.Duration,
}),
slaves: newSlaveStorage(),
taskRegistry: podtask.NewInMemoryRegistry(),
reconcileCooldown: config.ReconcileCooldown,
registration: make(chan struct{}),
asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
return proc.ErrorChanf("cannot execute action with unregistered scheduler")
}),
}
return k
}
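// Hedged sketch of constructing the scheduler via New (the real call site lives in
// the scheduler service code and may differ; every identifier on the right-hand
// side below is a placeholder): each Config field is copied into the scheduler's
// write-once state, and the offer registry's Compat filter is derived from the
// executor's UID group.
//
//    sched := New(Config{
//        Schedcfg:          schedConfig,    // schedcfg.Config loaded elsewhere
//        Executor:          executorInfo,   // *mesos.ExecutorInfo built elsewhere
//        ScheduleFunc:      scheduleFunc,   // a PodScheduleFunc implementation
//        Client:            kubeClient,
//        EtcdClient:        etcdClient,
//        FailoverTimeout:   300,
//        ReconcileInterval: 300,
//        ReconcileCooldown: 15 * time.Second,
//    })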
func (k *KubernetesScheduler) Init(electedMaster proc.Process, pl PluginInterface, mux *http.ServeMux) error {
log.V(1).Infoln("initializing kubernetes mesos scheduler")
k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
if !k.registered {
return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
}
return electedMaster.Do(a)
})
k.terminate = electedMaster.Done()
k.plugin = pl
k.offers.Init(k.terminate)
k.InstallDebugHandlers(mux)
return k.recoverTasks()
}
func (k *KubernetesScheduler) asMaster() proc.Doer {
k.RLock()
defer k.RUnlock()
return k.asRegisteredMaster
}
func (k *KubernetesScheduler) InstallDebugHandlers(mux *http.ServeMux) {
wrappedHandler := func(uri string, h http.Handler) {
mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
ch := make(chan struct{})
closer := runtime.Closer(ch)
proc.OnError(k.asMaster().Do(func() {
defer closer()
h.ServeHTTP(w, r)
}), func(err error) {
defer closer()
log.Warningf("failed HTTP request for %s: %v", uri, err)
w.WriteHeader(http.StatusServiceUnavailable)
}, k.terminate)
select {
case <-time.After(k.schedcfg.HttpHandlerTimeout.Duration):
log.Warningf("timed out waiting for request to be processed")
w.WriteHeader(http.StatusServiceUnavailable)
return
case <-ch: // noop
}
})
}
requestReconciliation := func(uri string, requestAction func()) {
wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestAction()
w.WriteHeader(http.StatusNoContent)
}))
}
requestReconciliation("/debug/actions/requestExplicit", k.reconciler.RequestExplicit)
requestReconciliation("/debug/actions/requestImplicit", k.reconciler.RequestImplicit)
wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
slaves := k.slaves.getSlaveIds()
for _, slaveId := range slaves {
_, err := k.driver.SendFrameworkMessage(
k.executor.ExecutorId,
mutil.NewSlaveID(slaveId),
messages.Kamikaze)
if err != nil {
log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
} else {
io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
}
}
io.WriteString(w, "OK")
}))
}
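// The debug endpoints registered above respond only while this scheduler is the
// elected, registered master; a hedged example of poking them (host and port are
// assumptions, not taken from this change):
//
//    curl http://scheduler-host:10251/debug/actions/requestExplicit   # request explicit reconciliation
//    curl http://scheduler-host:10251/debug/actions/requestImplicit   # request implicit reconciliation
//    curl http://scheduler-host:10251/debug/actions/kamikaze          # ask every known executor to self-terminate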
func (k *KubernetesScheduler) Registration() <-chan struct{} {
return k.registration
}
// Registered is called when the scheduler has successfully registered with the master.
func (k *KubernetesScheduler) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)
k.driver = drv
k.frameworkId = fid
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.reconciler.RequestExplicit()
}
func (k *KubernetesScheduler) storeFrameworkId() {
// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
_, err := k.etcdClient.Set(meta.FrameworkIDKey, k.frameworkId.GetValue(), uint64(k.failoverTimeout))
if err != nil {
log.Errorf("failed to renew frameworkId TTL: %v", err)
}
}
// Reregistered is called when the scheduler has successfully re-registered with the master.
// This happens when the master fails over.
func (k *KubernetesScheduler) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
log.Infof("Scheduler reregistered with the master: %v\n", mi)
k.driver = drv
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.reconciler.RequestExplicit()
}
// perform one-time initialization actions upon the first registration event received from Mesos.
func (k *KubernetesScheduler) onInitialRegistration(driver bindings.SchedulerDriver) {
defer close(k.registration)
if k.failoverTimeout > 0 {
refreshInterval := k.schedcfg.FrameworkIdRefreshInterval.Duration
if k.failoverTimeout < k.schedcfg.FrameworkIdRefreshInterval.Duration.Seconds() {
refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
}
go runtime.Until(k.storeFrameworkId, refreshInterval, k.terminate)
}
r1 := k.makeTaskRegistryReconciler()
r2 := k.makePodRegistryReconciler()
k.reconciler = newReconciler(k.asRegisteredMaster, k.makeCompositeReconciler(r1, r2),
k.reconcileCooldown, k.schedcfg.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
go k.reconciler.Run(driver)
if k.reconcileInterval > 0 {
ri := time.Duration(k.reconcileInterval) * time.Second
time.AfterFunc(k.schedcfg.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.reconciler.RequestImplicit, ri, k.terminate) })
log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedcfg.InitialImplicitReconciliationDelay.Duration)
}
}
// Disconnected is called when the scheduler loses connection to the master.
func (k *KubernetesScheduler) Disconnected(driver bindings.SchedulerDriver) {
log.Infof("Master disconnected!\n")
k.registered = false
// discard all cached offers to avoid unnecessary TASK_LOST updates
k.offers.Invalidate("")
}
// ResourceOffers is called when the scheduler receives some offers from the master.
func (k *KubernetesScheduler) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
log.V(2).Infof("Received offers %+v", offers)
// Record the offers in the global offer map as well as each slave's offer map.
k.offers.Add(offers)
for _, offer := range offers {
slaveId := offer.GetSlaveId().GetValue()
k.slaves.checkAndAdd(slaveId, offer.GetHostname())
}
}
// OfferRescinded is called when the resources are rescinded from the scheduler.
func (k *KubernetesScheduler) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
log.Infof("Offer rescinded %v\n", offerId)
oid := offerId.GetValue()
k.offers.Delete(oid, offerMetrics.OfferRescinded)
}
// StatusUpdate is called when a status update message is sent to the scheduler.
func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
source, reason := "none", "none"
if taskStatus.Source != nil {
source = (*taskStatus.Source).String()
}
if taskStatus.Reason != nil {
reason = (*taskStatus.Reason).String()
}
taskState := taskStatus.GetState()
metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()
log.Infof(
"task status update %q from %q for task %q on slave %q executor %q for reason %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason)
switch taskState {
case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
if _, state := k.taskRegistry.UpdateStatus(taskStatus); state == podtask.StateUnknown {
if taskState != mesos.TaskState_TASK_FINISHED {
//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
//I don't want to reincarnate then.. TASK_LOST is a special case because
//the master is stateless and there are scenarios where I may get TASK_LOST
//followed by TASK_RUNNING.
//TODO(jdef) consider running this asynchronously since there are API server
//calls that may be made
k.reconcileNonTerminalTask(driver, taskStatus)
} // else, we don't really care about FINISHED tasks that aren't registered
return
}
if _, exists := k.slaves.getSlave(taskStatus.GetSlaveId().GetValue()); !exists {
// a registered task has an update reported by a slave that we don't recognize.
// this should never happen! So we don't reconcile it.
log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
return
}
case mesos.TaskState_TASK_FAILED:
if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
go k.plugin.reconcilePod(task.Pod)
return
}
} else {
// unknown task failed, not much we can do about it
return
}
// last-ditch effort to reconcile our records
fallthrough
case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
k.reconcileTerminalTask(driver, taskStatus)
}
}
func (k *KubernetesScheduler) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
task, state := k.taskRegistry.UpdateStatus(taskStatus)
if (state == podtask.StateRunning || state == podtask.StatePending) && taskStatus.SlaveId != nil &&
((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED)) {
//--
// pod-task has metadata that refers to:
// (1) a task that Mesos no longer knows about, or else
// (2) a pod that the Kubelet will never report as "failed"
// For now, destroy the pod and hope that there's a replication controller backing it up.
// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
pod := &task.Pod
log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
if err := k.client.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil && !errors.IsNotFound(err) {
log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
}
} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
// attempt to prevent dangling pods in the pod and task registries
log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
k.reconciler.RequestExplicit()
} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
//If we're reconciling and receive this then the executor may be
//running a task that we need it to kill. It's possible that the framework
//is unrecognized by the master at this point, so KillTask is not guaranteed
//to do anything. The underlying driver transport may be able to send a
//FrameworkMessage directly to the slave to terminate the task.
log.V(2).Info("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
data := fmt.Sprintf("task-lost:%s", task.ID) //TODO(jdef) use a real message type
if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
log.Error(err.Error())
}
}
}
// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
// attempt to recover task from pod info:
// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
// - pull the pod metadata down from the api server
// - perform task recovery based on pod metadata
taskId := taskStatus.TaskId.GetValue()
if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
// there will be no data in the task status that we can use to determine the associated pod
switch taskStatus.GetState() {
case mesos.TaskState_TASK_STAGING:
// there is still hope for this task, don't kill it just yet
//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
return
default:
// for TASK_{STARTING,RUNNING} we should have already attempted recoverTasks().
// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
// be processing this reconciliation update before we process the one from the executor.
// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
// so it gets killed.
log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
}
} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
podStatus.Name, taskId, err)
} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
if t, ok, err := podtask.RecoverFrom(*pod); ok {
log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
_, err := k.taskRegistry.Register(t, nil)
if err != nil {
// someone beat us to it?!
log.Warningf("failed to register recovered task: %v", err)
return
} else {
k.taskRegistry.UpdateStatus(taskStatus)
}
return
} else if err != nil {
//should kill the pod and the task
log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
}
} else {
//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
//metadata is not appropriate for task reconstruction -- which should almost certainly never
//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
//we were failed over.
//kill this task, allow the newly launched scheduler to schedule the new pod
log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
}
} else if errors.IsNotFound(err) {
// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
} else if errors.IsServerTimeout(err) {
log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
return
} else {
log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
return
}
if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
log.Errorf("failed to kill task %v: %v", taskId, err)
}
}
// FrameworkMessage is called when the scheduler receives a message from the executor.
func (k *KubernetesScheduler) FrameworkMessage(driver bindings.SchedulerDriver,
executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) {
log.Infof("Received messages from executor %v of slave %v, %v\n", executorId, slaveId, message)
}
// SlaveLost is called when some slave is lost.
func (k *KubernetesScheduler) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) {
log.Infof("Slave %v is lost\n", slaveId)
sid := slaveId.GetValue()
k.offers.InvalidateForSlave(sid)
// TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile
// tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically
// flush lost slaves older than X, and for which no tasks or pods reference.
// unfinished tasks/pods will be dropped. use a replication controller if you want pods to
// be restarted when slaves die.
}
// ExecutorLost is called when some executor is lost.
func (k *KubernetesScheduler) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) {
log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status)
// TODO(yifan): Restart any unfinished tasks of the executor.
}
// Error is called when there is an unrecoverable error in the scheduler or scheduler driver.
// The driver should have been aborted before this is invoked.
func (k *KubernetesScheduler) Error(driver bindings.SchedulerDriver, message string) {
log.Fatalf("fatal scheduler error: %v\n", message)
}
// filter func used for explicit task reconciliation, selects only non-terminal tasks which
// have been communicated to mesos (read: launched).
func explicitTaskFilter(t *podtask.T) bool {
switch t.State {
case podtask.StateRunning:
return true
case podtask.StatePending:
return t.Has(podtask.Launched)
default:
return false
}
}
// invoke the given ReconcilerAction funcs in sequence, aborting the sequence if reconciliation
// is cancelled. if any other errors occur the composite reconciler will attempt to complete the
// sequence, reporting only the last generated error.
func (k *KubernetesScheduler) makeCompositeReconciler(actions ...ReconcilerAction) ReconcilerAction {
if x := len(actions); x == 0 {
// programming error
panic("no actions specified for composite reconciler")
} else if x == 1 {
return actions[0]
}
chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b ReconcilerAction) <-chan error {
ech := a(d, c)
ch := make(chan error, 1)
go func() {
select {
case <-k.terminate:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
ech = b(d, c)
select {
case <-k.terminate:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
close(ch)
return
}
}
ch <- fmt.Errorf("aborting composite reconciler action")
}()
return ch
}
result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, actions[0], actions[1])
}
for i := 2; i < len(actions); i++ {
i := i
next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, ReconcilerAction(result), actions[i])
}
result = next
}
return ReconcilerAction(result)
}
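// Hedged usage sketch for the composite reconciler (r1, r2 and cancel are
// placeholders): the composed action shares one cancel channel, so cancelling
// aborts whichever step is currently running.
//
//    composite := k.makeCompositeReconciler(r1, r2)
//    if err := <-composite(driver, cancel); err != nil {
//        log.Errorf("composite reconciliation failed: %v", err)
//    }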
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks listed in the scheduler's internal taskRegistry.
func (k *KubernetesScheduler) makeTaskRegistryReconciler() ReconcilerAction {
return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
taskToSlave := make(map[string]string)
for _, t := range k.taskRegistry.List(explicitTaskFilter) {
if t.Spec.SlaveID != "" {
taskToSlave[t.ID] = t.Spec.SlaveID
}
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks identified by annotations in the Kubernetes pod registry.
func (k *KubernetesScheduler) makePodRegistryReconciler() ReconcilerAction {
return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
ctx := api.NewDefaultContext()
podList, err := k.client.Pods(api.NamespaceValue(ctx)).List(labels.Everything(), fields.Everything())
if err != nil {
return proc.ErrorChanf("failed to reconcile pod registry: %v", err)
}
taskToSlave := make(map[string]string)
for _, pod := range podList.Items {
if len(pod.Annotations) == 0 {
continue
}
taskId, found := pod.Annotations[meta.TaskIdKey]
if !found {
continue
}
slaveId, found := pod.Annotations[meta.SlaveIdKey]
if !found {
continue
}
taskToSlave[taskId] = slaveId
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
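// Note: the pod registry reconciler above only considers pods that carry both the
// meta.TaskIdKey and meta.SlaveIdKey annotations; pods missing either annotation are
// skipped, which is presumably the right behaviour for pods never launched through
// this framework.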
// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
func (k *KubernetesScheduler) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
log.Info("explicit reconcile tasks")
// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
statusList := []*mesos.TaskStatus{}
remaining := util.KeySet(reflect.ValueOf(taskToSlave))
for taskId, slaveId := range taskToSlave {
if slaveId == "" {
delete(taskToSlave, taskId)
continue
}
statusList = append(statusList, &mesos.TaskStatus{
TaskId: mutil.NewTaskID(taskId),
SlaveId: mutil.NewSlaveID(slaveId),
State: mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
})
}
select {
case <-cancel:
return reconciliationCancelledErr
default:
if _, err := driver.ReconcileTasks(statusList); err != nil {
return err
}
}
start := time.Now()
first := true
for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
first = false
// nothing to do here other than wait for status updates..
if backoff > k.schedcfg.ExplicitReconciliationMaxBackoff.Duration {
backoff = k.schedcfg.ExplicitReconciliationMaxBackoff.Duration
}
select {
case <-cancel:
return reconciliationCancelledErr
case <-time.After(backoff):
for taskId := range remaining {
if task, _ := k.taskRegistry.Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
// keep this task in remaining list
continue
}
remaining.Delete(taskId)
}
}
}
return nil
}
var (
reconciliationCancelledErr = fmt.Errorf("explicit task reconciliation cancelled")
)
type ReconcilerAction func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error
type Reconciler struct {
proc.Doer
Action ReconcilerAction
explicit chan struct{} // send an empty struct to trigger explicit reconciliation
implicit chan struct{} // send an empty struct to trigger implicit reconciliation
done <-chan struct{} // close this when you want the reconciler to exit
cooldown time.Duration
explicitReconciliationAbortTimeout time.Duration
}
func newReconciler(doer proc.Doer, action ReconcilerAction,
cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) *Reconciler {
return &Reconciler{
Doer: doer,
explicit: make(chan struct{}, 1),
implicit: make(chan struct{}, 1),
cooldown: cooldown,
explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
done: done,
Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
// trigger the reconciler action in the doer's execution context,
// but it could take a while and the scheduler needs to be able to
// process updates, the callbacks for which ALSO execute in the SAME
// deferred execution context -- so the action MUST be executed async.
errOnce := proc.NewErrorOnce(cancel)
return errOnce.Send(doer.Do(func() {
// only triggers the action if we're the currently elected,
// registered master and runs the action async.
go func() {
var err <-chan error
defer errOnce.Send(err)
err = action(driver, cancel)
}()
})).Err()
},
}
}
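// Hedged usage sketch, mirroring onInitialRegistration above: the reconciler runs in
// its own goroutine and is fed through the Request* methods, whose buffered channels
// make a second request a no-op while one is already pending.
//
//    r := newReconciler(k.asRegisteredMaster, action, cooldown, abortTimeout, k.terminate)
//    go r.Run(driver)
//    r.RequestExplicit()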
func (r *Reconciler) RequestExplicit() {
select {
case r.explicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
func (r *Reconciler) RequestImplicit() {
select {
case r.implicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
// execute task reconciliation, returns when r.done is closed. intended to run as a goroutine.
// if reconciliation is requested while another is in progress, the in-progress operation will be
// cancelled before the new reconciliation operation begins.
func (r *Reconciler) Run(driver bindings.SchedulerDriver) {
var cancel, finished chan struct{}
requestLoop:
for {
select {
case <-r.done:
return
default: // proceed
}
select {
case <-r.implicit:
metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
select {
case <-r.done:
return
case <-r.explicit:
break // give preference to a pending request for explicit
default: // continue
// don't run implicit reconciliation while explicit is ongoing
if finished != nil {
select {
case <-finished: // continue w/ implicit
default:
log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
continue requestLoop
}
}
errOnce := proc.NewErrorOnce(r.done)
errCh := r.Do(func() {
var err error
defer errOnce.Report(err)
log.Infoln("implicit reconcile tasks")
metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
}
})
proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
log.Errorf("failed to run implicit reconciliation: %v", err)
}, r.done)
goto slowdown
}
case <-r.done:
return
case <-r.explicit: // continue
metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
}
if cancel != nil {
close(cancel)
cancel = nil
// play nice and wait for the prior operation to finish, complain
// if it doesn't
select {
case <-r.done:
return
case <-finished: // noop, expected
case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
log.Error("reconciler action failed to stop upon cancellation")
}
}
// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
// if cancellation takes too long or fails - we don't want to close the same chan
// more than once
cancel = make(chan struct{})
finished = make(chan struct{})
go func(fin chan struct{}) {
startedAt := time.Now()
defer func() {
metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
}()
metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
defer close(fin)
err := <-r.Action(driver, cancel)
if err == reconciliationCancelledErr {
metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
log.Infoln(err.Error())
} else if err != nil {
log.Errorf("reconciler action failed: %v", err)
}
}(finished)
slowdown:
// don't allow reconciliation to run very frequently, either explicit or implicit
select {
case <-r.done:
return
case <-time.After(r.cooldown): // noop
}
} // for
}
func (ks *KubernetesScheduler) recoverTasks() error {
ctx := api.NewDefaultContext()
podList, err := ks.client.Pods(api.NamespaceValue(ctx)).List(labels.Everything(), fields.Everything())
if err != nil {
log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err)
return err
}
recoverSlave := func(t *podtask.T) {
slaveId := t.Spec.SlaveID
ks.slaves.checkAndAdd(slaveId, t.Offer.Host())
}
for _, pod := range podList.Items {
if t, ok, err := podtask.RecoverFrom(pod); err != nil {
log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err)
err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil)
//TODO(jdef) check for temporary or not-found errors
if err != nil {
log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err)
}
} else if ok {
ks.taskRegistry.Register(t, nil)
recoverSlave(t)
log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name)
}
}
return nil
}

View File

@@ -0,0 +1,350 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"testing"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/offers"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/proc"
schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
mesos "github.com/mesos/mesos-go/mesosproto"
util "github.com/mesos/mesos-go/mesosutil"
"github.com/stretchr/testify/assert"
)
// Check that same slave is only added once.
func TestSlaveStorage_checkAndAdd(t *testing.T) {
assert := assert.New(t)
slaveStorage := newSlaveStorage()
assert.Equal(0, len(slaveStorage.slaves))
slaveId := "slave1"
slaveHostname := "slave1Hostname"
slaveStorage.checkAndAdd(slaveId, slaveHostname)
assert.Equal(1, len(slaveStorage.getSlaveIds()))
slaveStorage.checkAndAdd(slaveId, slaveHostname)
assert.Equal(1, len(slaveStorage.getSlaveIds()))
}
// Check that getSlave returns false for a nonexistent slave.
func TestSlaveStorage_getSlave(t *testing.T) {
assert := assert.New(t)
slaveStorage := newSlaveStorage()
assert.Equal(0, len(slaveStorage.slaves))
slaveId := "slave1"
slaveHostname := "slave1Hostname"
_, exists := slaveStorage.getSlave(slaveId)
assert.Equal(false, exists)
slaveStorage.checkAndAdd(slaveId, slaveHostname)
assert.Equal(1, len(slaveStorage.getSlaveIds()))
_, exists = slaveStorage.getSlave(slaveId)
assert.Equal(true, exists)
}
// Check that getSlaveIds returns array with all slaveIds.
func TestSlaveStorage_getSlaveIds(t *testing.T) {
assert := assert.New(t)
slaveStorage := newSlaveStorage()
assert.Equal(0, len(slaveStorage.slaves))
slaveId := "1"
slaveHostname := "hn1"
slaveStorage.checkAndAdd(slaveId, slaveHostname)
assert.Equal(1, len(slaveStorage.getSlaveIds()))
slaveId = "2"
slaveHostname = "hn2"
slaveStorage.checkAndAdd(slaveId, slaveHostname)
assert.Equal(2, len(slaveStorage.getSlaveIds()))
slaveIds := slaveStorage.getSlaveIds()
slaveIdsMap := make(map[string]bool, len(slaveIds))
for _, s := range slaveIds {
slaveIdsMap[s] = true
}
_, ok := slaveIdsMap["1"]
assert.Equal(ok, true)
_, ok = slaveIdsMap["2"]
assert.Equal(ok, true)
}
//get number of non-expired offers from offer registry
func getNumberOffers(os offers.Registry) int {
//walk offers and count those still stored in the registry
walked := 0
walker1 := func(p offers.Perishable) (bool, error) {
walked++
return false, nil
}
os.Walk(walker1)
return walked
}
//test adding of a resource offer; it should be added to the offer registry and slaves
func TestResourceOffer_Add(t *testing.T) {
assert := assert.New(t)
testScheduler := &KubernetesScheduler{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
},
DeclineOffer: func(offerId string) <-chan error {
return proc.ErrorChan(nil)
},
// remember expired offers so that we can tell if a previously scheduled pod still relies on one
LingerTTL: schedcfg.DefaultOfferLingerTTL,
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaves: newSlaveStorage(),
}
hostname := "h1"
offerID1 := util.NewOfferID("test1")
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
assert.Equal(1, getNumberOffers(testScheduler.offers))
//check slave hostname
assert.Equal(1, len(testScheduler.slaves.getSlaveIds()))
//add another offer
hostname2 := "h2"
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
//check it is stored in registry
assert.Equal(2, getNumberOffers(testScheduler.offers))
//check slave hostnames
assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
}
//test adding and rescinding of resource offers; they should be added to the offer registry and slaves, then removed when rescinded
func TestResourceOffer_Add_Rescind(t *testing.T) {
assert := assert.New(t)
testScheduler := &KubernetesScheduler{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
},
DeclineOffer: func(offerId string) <-chan error {
return proc.ErrorChan(nil)
},
// remember expired offers so that we can tell if a previously scheduled pod still relies on one
LingerTTL: schedcfg.DefaultOfferLingerTTL,
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaves: newSlaveStorage(),
}
hostname := "h1"
offerID1 := util.NewOfferID("test1")
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
assert.Equal(1, getNumberOffers(testScheduler.offers))
//check slave hostname
assert.Equal(1, len(testScheduler.slaves.getSlaveIds()))
//add another offer
hostname2 := "h2"
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
assert.Equal(2, getNumberOffers(testScheduler.offers))
//check slave hostnames
assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
//next, check whether offers can be rescinded
testScheduler.OfferRescinded(nil, offerID1)
assert.Equal(1, getNumberOffers(testScheduler.offers))
//next, check whether the remaining offer can be rescinded
testScheduler.OfferRescinded(nil, util.NewOfferID("test2"))
//walk offers again and check they are removed from the registry
assert.Equal(0, getNumberOffers(testScheduler.offers))
//remove a non-existing ID
testScheduler.OfferRescinded(nil, util.NewOfferID("notExist"))
}
//test that when a slave is lost we remove all offers
func TestSlave_Lost(t *testing.T) {
assert := assert.New(t)
//
testScheduler := &KubernetesScheduler{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
},
// remember expired offers so that we can tell if a previously scheduled pod still relies on one
LingerTTL: schedcfg.DefaultOfferLingerTTL,
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaves: newSlaveStorage(),
}
hostname := "h1"
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
//add another offer from different slaveID
hostname2 := "h2"
offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers3 := []*mesos.Offer{offer3}
testScheduler.ResourceOffers(nil, offers3)
//test precondition
assert.Equal(3, getNumberOffers(testScheduler.offers))
assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
//remove first slave
testScheduler.SlaveLost(nil, util.NewSlaveID(hostname))
//offers should be removed
assert.Equal(1, getNumberOffers(testScheduler.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
//remove second slave
testScheduler.SlaveLost(nil, util.NewSlaveID(hostname2))
//offers should be removed
assert.Equal(0, getNumberOffers(testScheduler.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
//try to remove a non-existing slave
testScheduler.SlaveLost(nil, util.NewSlaveID("notExist"))
}
//test that when we lose the connection to the master we invalidate all cached offers
func TestDisconnect(t *testing.T) {
assert := assert.New(t)
//
testScheduler := &KubernetesScheduler{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
},
// remember expired offers so that we can tell if a previously scheduled pod still relies on one
LingerTTL: schedcfg.DefaultOfferLingerTTL,
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaves: newSlaveStorage(),
}
hostname := "h1"
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
//add another offer from different slaveID
hostname2 := "h2"
offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers3 := []*mesos.Offer{offer3}
testScheduler.ResourceOffers(nil, offers3)
//disconnect
testScheduler.Disconnected(nil)
//all offers should be removed
assert.Equal(0, getNumberOffers(testScheduler.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaves.getSlaveIds()))
}
//test that we can handle different status updates; TODO: check state transitions
func TestStatus_Update(t *testing.T) {
mockdriver := MockSchedulerDriver{}
// setup expectations
mockdriver.On("KillTask", util.NewTaskID("test-task-001")).Return(mesos.Status_DRIVER_RUNNING, nil)
testScheduler := &KubernetesScheduler{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
},
// remember expired offers so that we can tell if a previously scheduled pod still relies on one
LingerTTL: schedcfg.DefaultOfferLingerTTL,
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaves: newSlaveStorage(),
driver: &mockdriver,
taskRegistry: podtask.NewInMemoryRegistry(),
}
taskStatus_task_starting := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_RUNNING,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_starting)
taskStatus_task_running := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_RUNNING,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_running)
taskStatus_task_failed := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_FAILED,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_failed)
//assert that mock was invoked
mockdriver.AssertExpectations(t)
}

View File

@@ -0,0 +1,32 @@
// +build unit_test
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"os"
"syscall"
)
func makeFailoverSigChan() <-chan os.Signal {
return nil
}
func makeDisownedProcAttr() *syscall.SysProcAttr {
return nil
}
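// The unit_test build tag above selects these no-op stubs in place of the
// platform-specific implementations, presumably so unit tests run without touching
// process signals; an assumed invocation would look like:
//
//    go test -tags unit_test ./contrib/mesos/pkg/scheduler/service/...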

View File

@@ -0,0 +1,38 @@
// +build darwin dragonfly freebsd linux netbsd openbsd
// +build !unit_test
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"os"
"os/signal"
"syscall"
)
func makeFailoverSigChan() <-chan os.Signal {
ch := make(chan os.Signal, 1)
signal.Notify(ch, syscall.SIGUSR1)
return ch
}
func makeDisownedProcAttr() *syscall.SysProcAttr {
return &syscall.SysProcAttr{
Setpgid: true, // disown the spawned scheduler
}
}

View File

@@ -0,0 +1,51 @@
// +build windows
// +build !unit_test
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"os"
"syscall"
)
func makeFailoverSigChan() <-chan os.Signal {
/* TODO(jdef)
from go's windows compatibility test, it looks like we need to provide a filtered
signal channel here
c := make(chan os.Signal, 10)
signal.Notify(c)
select {
case s := <-c:
if s != os.Interrupt {
log.Fatalf("Wrong signal received: got %q, want %q\n", s, os.Interrupt)
}
case <-time.After(3 * time.Second):
log.Fatalf("Timeout waiting for Ctrl+Break\n")
}
*/
return nil
}
func makeDisownedProcAttr() *syscall.SysProcAttr {
//TODO(jdef) test this somehow?!?!
return &syscall.SysProcAttr{
CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP | syscall.CREATE_UNICODE_ENVIRONMENT,
}
}

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package service contains the cmd/k8sm-scheduler glue code
package service

View File

@@ -0,0 +1,121 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"net"
"reflect"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
"github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports"
"github.com/golang/glog"
)
const (
SCHEDULER_SERVICE_NAME = "k8sm-scheduler"
)
func (m *SchedulerServer) newServiceWriter(stop <-chan struct{}) func() {
return func() {
for {
// Update service & endpoint records.
// TODO(k8s): when it becomes possible to change this stuff,
// stop polling and start watching.
if err := m.createSchedulerServiceIfNeeded(SCHEDULER_SERVICE_NAME, ports.SchedulerPort); err != nil {
glog.Errorf("Can't create scheduler service: %v", err)
}
if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.Address), m.Port); err != nil {
glog.Errorf("Can't create scheduler endpoints: %v", err)
}
select {
case <-stop:
return
case <-time.After(10 * time.Second):
}
}
}
}
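// newServiceWriter returns a closure instead of spawning its own goroutine; a
// hedged sketch of the expected call site (the real wiring lives in the scheduler
// bootstrap code and may differ):
//
//    stop := make(chan struct{})
//    go m.newServiceWriter(stop)()
//    // ... on shutdown:
//    close(stop)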
// createSchedulerServiceIfNeeded will create the specified service if it
// doesn't already exist.
func (m *SchedulerServer) createSchedulerServiceIfNeeded(serviceName string, servicePort int) error {
ctx := api.NewDefaultContext()
if _, err := m.client.Services(api.NamespaceValue(ctx)).Get(serviceName); err == nil {
// The service already exists.
return nil
}
svc := &api.Service{
ObjectMeta: api.ObjectMeta{
Name: serviceName,
Namespace: api.NamespaceDefault,
Labels: map[string]string{"provider": "k8sm", "component": "scheduler"},
},
Spec: api.ServiceSpec{
Ports: []api.ServicePort{{Port: servicePort, Protocol: api.ProtocolTCP}},
// maintained by this code, not by the pod selector
Selector: nil,
SessionAffinity: api.ServiceAffinityNone,
},
}
if m.ServiceAddress != nil {
svc.Spec.ClusterIP = m.ServiceAddress.String()
}
_, err := m.client.Services(api.NamespaceValue(ctx)).Create(svc)
if err != nil && errors.IsAlreadyExists(err) {
err = nil
}
return err
}
// setEndpoints sets the endpoints for the given service.
// in a multi-master scenario only the elected master will be publishing an endpoint.
// see SchedulerServer.bootstrap.
func (m *SchedulerServer) setEndpoints(serviceName string, ip net.IP, port int) error {
// The setting we want to find.
want := []api.EndpointSubset{{
Addresses: []api.EndpointAddress{{IP: ip.String()}},
Ports: []api.EndpointPort{{Port: port, Protocol: api.ProtocolTCP}},
}}
ctx := api.NewDefaultContext()
e, err := m.client.Endpoints(api.NamespaceValue(ctx)).Get(serviceName)
createOrUpdate := m.client.Endpoints(api.NamespaceValue(ctx)).Update
if err != nil {
if errors.IsNotFound(err) {
createOrUpdate = m.client.Endpoints(api.NamespaceValue(ctx)).Create
}
e = &api.Endpoints{
ObjectMeta: api.ObjectMeta{
Name: serviceName,
Namespace: api.NamespaceDefault,
},
}
}
if !reflect.DeepEqual(e.Subsets, want) {
e.Subsets = want
glog.Infof("setting endpoints for master service %q to %#v", serviceName, e)
_, err = createOrUpdate(e)
return err
}
// We didn't make any changes, no need to actually call update.
return nil
}

View File

@@ -0,0 +1,751 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"bufio"
"errors"
"fmt"
"io/ioutil"
"net"
"net/http"
"os"
"os/exec"
"os/user"
"strconv"
"strings"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/election"
execcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/executor/config"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/hyperkube"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/profile"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/runtime"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler"
schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/ha"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/uid"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/clientauth"
"github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports"
"github.com/GoogleCloudPlatform/kubernetes/pkg/tools"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/coreos/go-etcd/etcd"
"github.com/gogo/protobuf/proto"
log "github.com/golang/glog"
"github.com/kardianos/osext"
"github.com/mesos/mesos-go/auth"
"github.com/mesos/mesos-go/auth/sasl"
"github.com/mesos/mesos-go/auth/sasl/mech"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler"
"github.com/prometheus/client_golang/prometheus"
"github.com/spf13/pflag"
"golang.org/x/net/context"
)
const (
defaultMesosMaster = "localhost:5050"
defaultMesosUser = "root" // should have privs to execute docker and iptables commands
defaultReconcileInterval = 300 // 5m default task reconciliation interval
defaultReconcileCooldown = 15 * time.Second
defaultFrameworkName = "Kubernetes"
)
type SchedulerServer struct {
Port int
Address util.IP
EnableProfiling bool
AuthPath string
APIServerList util.StringList
EtcdServerList util.StringList
EtcdConfigFile string
AllowPrivileged bool
ExecutorPath string
ProxyPath string
MesosMaster string
MesosUser string
MesosRole string
MesosAuthPrincipal string
MesosAuthSecretFile string
Checkpoint bool
FailoverTimeout float64
ExecutorBindall bool
ExecutorRunProxy bool
ExecutorProxyBindall bool
ExecutorLogV int
ExecutorSuicideTimeout time.Duration
MesosAuthProvider string
DriverPort uint
HostnameOverride string
ReconcileInterval int64
ReconcileCooldown time.Duration
SchedulerConfigFileName string
Graceful bool
FrameworkName string
FrameworkWebURI string
HA bool
AdvertisedAddress string
ServiceAddress util.IP
HADomain string
KMPath string
ClusterDNS util.IP
ClusterDomain string
KubeletRootDirectory string
KubeletDockerEndpoint string
KubeletPodInfraContainerImage string
KubeletCadvisorPort uint
KubeletHostNetworkSources string
KubeletSyncFrequency time.Duration
KubeletNetworkPluginName string
executable string // path to the binary running this service
client *client.Client
driver bindings.SchedulerDriver
driverMutex sync.RWMutex
mux *http.ServeMux
}
// useful for unit testing specific funcs
type schedulerProcessInterface interface {
End() <-chan struct{}
Failover() <-chan struct{}
Terminal() <-chan struct{}
}
// NewSchedulerServer creates a new SchedulerServer with default parameters
func NewSchedulerServer() *SchedulerServer {
s := SchedulerServer{
Port: ports.SchedulerPort,
Address: util.IP(net.ParseIP("127.0.0.1")),
FailoverTimeout: time.Duration((1 << 62) - 1).Seconds(),
ExecutorRunProxy: true,
ExecutorSuicideTimeout: execcfg.DefaultSuicideTimeout,
MesosAuthProvider: sasl.ProviderName,
MesosMaster: defaultMesosMaster,
MesosUser: defaultMesosUser,
ReconcileInterval: defaultReconcileInterval,
ReconcileCooldown: defaultReconcileCooldown,
Checkpoint: true,
FrameworkName: defaultFrameworkName,
HA: false,
mux: http.NewServeMux(),
KubeletCadvisorPort: 4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
KubeletSyncFrequency: 10 * time.Second,
}
// cache this for later use. also useful in case the original binary gets deleted, e.g.
// during upgrades, development deployments, etc.
if filename, err := osext.Executable(); err != nil {
log.Fatalf("failed to determine path to currently running executable: %v", err)
} else {
s.executable = filename
s.KMPath = filename
}
return &s
}
func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
fs.IntVar(&s.Port, "port", s.Port, "The port that the scheduler's http service runs on")
fs.Var(&s.Address, "address", "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
fs.BoolVar(&s.EnableProfiling, "profiling", s.EnableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
fs.Var(&s.APIServerList, "api-servers", "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
fs.StringVar(&s.AuthPath, "auth-path", s.AuthPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
fs.Var(&s.EtcdServerList, "etcd-servers", "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
fs.StringVar(&s.EtcdConfigFile, "etcd-config", s.EtcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
fs.BoolVar(&s.AllowPrivileged, "allow-privileged", s.AllowPrivileged, "If true, allow privileged containers.")
fs.StringVar(&s.ClusterDomain, "cluster-domain", s.ClusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
fs.Var(&s.ClusterDNS, "cluster-dns", "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")
fs.StringVar(&s.MesosMaster, "mesos-master", s.MesosMaster, "Location of the Mesos master. The format is a comma-delimited list of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
fs.StringVar(&s.MesosUser, "mesos-user", s.MesosUser, "Mesos user for this framework, defaults to root.")
fs.StringVar(&s.MesosRole, "mesos-role", s.MesosRole, "Mesos role for this framework, defaults to none.")
fs.StringVar(&s.MesosAuthPrincipal, "mesos-authentication-principal", s.MesosAuthPrincipal, "Mesos authentication principal.")
fs.StringVar(&s.MesosAuthSecretFile, "mesos-authentication-secret-file", s.MesosAuthSecretFile, "Mesos authentication secret file.")
fs.StringVar(&s.MesosAuthProvider, "mesos-authentication-provider", s.MesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
fs.BoolVar(&s.Checkpoint, "checkpoint", s.Checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
fs.Float64Var(&s.FailoverTimeout, "failover-timeout", s.FailoverTimeout, "Framework failover timeout, in sec.")
fs.UintVar(&s.DriverPort, "driver-port", s.DriverPort, "Port that the Mesos scheduler driver process should listen on.")
fs.StringVar(&s.HostnameOverride, "hostname-override", s.HostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
fs.Int64Var(&s.ReconcileInterval, "reconcile-interval", s.ReconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
fs.DurationVar(&s.ReconcileCooldown, "reconcile-cooldown", s.ReconcileCooldown, "Minimum rest period between task reconciliation operations.")
fs.StringVar(&s.SchedulerConfigFileName, "scheduler-config", s.SchedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
fs.BoolVar(&s.Graceful, "graceful", s.Graceful, "Indicator of a graceful failover, intended for internal use only.")
fs.BoolVar(&s.HA, "ha", s.HA, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
fs.StringVar(&s.FrameworkName, "framework-name", s.FrameworkName, "The framework name to register with Mesos.")
fs.StringVar(&s.FrameworkWebURI, "framework-weburi", s.FrameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
fs.StringVar(&s.AdvertisedAddress, "advertised-address", s.AdvertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
fs.Var(&s.ServiceAddress, "service-address", "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
fs.BoolVar(&s.ExecutorBindall, "executor-bindall", s.ExecutorBindall, "When true, set -address of the executor to 0.0.0.0.")
fs.IntVar(&s.ExecutorLogV, "executor-logv", s.ExecutorLogV, "Logging verbosity of spawned executor processes.")
fs.BoolVar(&s.ExecutorProxyBindall, "executor-proxy-bindall", s.ExecutorProxyBindall, "When true, pass -proxy-bindall to the executor.")
fs.BoolVar(&s.ExecutorRunProxy, "executor-run-proxy", s.ExecutorRunProxy, "Run the kube-proxy as a child process of the executor.")
fs.DurationVar(&s.ExecutorSuicideTimeout, "executor-suicide-timeout", s.ExecutorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")
fs.StringVar(&s.KubeletRootDirectory, "kubelet-root-dir", s.KubeletRootDirectory, "Directory path for managing kubelet files (volume mounts, etc.). Defaults to the executor sandbox.")
fs.StringVar(&s.KubeletDockerEndpoint, "kubelet-docker-endpoint", s.KubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
fs.StringVar(&s.KubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.KubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
fs.UintVar(&s.KubeletCadvisorPort, "kubelet-cadvisor-port", s.KubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
fs.StringVar(&s.KubeletHostNetworkSources, "kubelet-host-network-sources", s.KubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host network. For all sources use \"*\" [default=\"file\"]")
fs.DurationVar(&s.KubeletSyncFrequency, "kubelet-sync-frequency", s.KubeletSyncFrequency, "Max period between synchronizing running containers and config")
fs.StringVar(&s.KubeletNetworkPluginName, "kubelet-network-plugin", s.KubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
//TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
//fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
}
func (s *SchedulerServer) AddStandaloneFlags(fs *pflag.FlagSet) {
s.addCoreFlags(fs)
fs.StringVar(&s.ExecutorPath, "executor-path", s.ExecutorPath, "Location of the kubernetes executor executable")
fs.StringVar(&s.ProxyPath, "proxy-path", s.ProxyPath, "Location of the kubernetes proxy executable")
}
func (s *SchedulerServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
s.addCoreFlags(fs)
fs.StringVar(&s.KMPath, "km-path", s.KMPath, "Location of the km executable, may be a URI or an absolute file path.")
}
// returns (downloadURI, basename(path))
func (s *SchedulerServer) serveFrameworkArtifact(path string) (string, string) {
serveFile := func(pattern string, filename string) {
s.mux.HandleFunc(pattern, func(w http.ResponseWriter, r *http.Request) {
http.ServeFile(w, r, filename)
})
}
// Create base path (http://foobar:5000/<base>)
pathSplit := strings.Split(path, "/")
var base string
if len(pathSplit) > 0 {
base = pathSplit[len(pathSplit)-1]
} else {
base = path
}
serveFile("/"+base, path)
hostURI := ""
if s.AdvertisedAddress != "" {
hostURI = fmt.Sprintf("http://%s/%s", s.AdvertisedAddress, base)
} else if s.HA && s.HADomain != "" {
hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.HADomain, ports.SchedulerPort, base)
} else {
hostURI = fmt.Sprintf("http://%s:%d/%s", s.Address.String(), s.Port, base)
}
log.V(2).Infof("Hosting artifact '%s' at '%s'", path, hostURI)
return hostURI, base
}
func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.ExecutorInfo, *uid.UID, error) {
ci := &mesos.CommandInfo{
Shell: proto.Bool(false),
}
//TODO(jdef) these should be shared constants with km
const (
KM_EXECUTOR = "executor"
KM_PROXY = "proxy"
)
if s.ExecutorPath != "" {
uri, executorCmd := s.serveFrameworkArtifact(s.ExecutorPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
} else if !hks.FindServer(KM_EXECUTOR) {
return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
} else {
if strings.Index(s.KMPath, "://") > 0 {
// URI could point directly to executable, e.g. hdfs:///km
// or else indirectly, e.g. http://acmestorage/tarball.tgz
// so we assume that in this case the command will always be "km"
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.KMPath), Executable: proto.Bool(true)})
ci.Value = proto.String("./km") // TODO(jdef) extract constant
} else if s.KMPath != "" {
uri, kmCmd := s.serveFrameworkArtifact(s.KMPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
} else {
uri, kmCmd := s.serveFrameworkArtifact(s.executable)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
}
ci.Arguments = append(ci.Arguments, KM_EXECUTOR)
}
if s.ProxyPath != "" {
uri, proxyCmd := s.serveFrameworkArtifact(s.ProxyPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-exec=./%s", proxyCmd))
} else if !hks.FindServer(KM_PROXY) {
return nil, nil, fmt.Errorf("either run this scheduler via km or else --proxy-path is required")
} else if s.ExecutorPath != "" {
return nil, nil, fmt.Errorf("proxy can only use km binary if executor does the same")
} // else, executor is smart enough to know when proxy-path is required, or to use km
//TODO(jdef): provide some way (env var?) for users to customize executor config
//TODO(jdef): set -address to 127.0.0.1 if `address` is 127.0.0.1
//TODO(jdef): propagate dockercfg from RootDirectory?
apiServerArgs := strings.Join(s.APIServerList, ",")
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--api-servers=%s", apiServerArgs))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.ExecutorLogV))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.AllowPrivileged))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.ExecutorSuicideTimeout))
if s.ExecutorBindall {
//TODO(jdef) determine whether hostname-override is really needed for bindall because
//it conflicts with kubelet node status checks/updates
//ci.Arguments = append(ci.Arguments, "--hostname-override=0.0.0.0")
ci.Arguments = append(ci.Arguments, "--address=0.0.0.0")
}
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.ExecutorProxyBindall))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.ExecutorRunProxy))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.KubeletCadvisorPort))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.KubeletSyncFrequency))
if s.AuthPath != "" {
//TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file
uri, basename := s.serveFrameworkArtifact(s.AuthPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri)})
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--auth-path=%s", basename))
}
appendOptional := func(name string, value string) {
if value != "" {
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value))
}
}
if s.ClusterDNS != nil {
appendOptional("cluster-dns", s.ClusterDNS.String())
}
appendOptional("cluster-domain", s.ClusterDomain)
appendOptional("root-dir", s.KubeletRootDirectory)
appendOptional("docker-endpoint", s.KubeletDockerEndpoint)
appendOptional("pod-infra-container-image", s.KubeletPodInfraContainerImage)
appendOptional("host-network-sources", s.KubeletHostNetworkSources)
appendOptional("network-plugin", s.KubeletNetworkPluginName)
log.V(1).Infof("prepared executor command %q with args '%+v'", ci.GetValue(), ci.Arguments)
// Create mesos scheduler driver.
info := &mesos.ExecutorInfo{
Command: ci,
Name: proto.String(execcfg.DefaultInfoName),
Source: proto.String(execcfg.DefaultInfoSource),
}
// calculate ExecutorInfo hash to be used for validating compatibility
// of ExecutorInfo's generated by other HA schedulers.
ehash := hashExecutorInfo(info)
eid := uid.New(ehash, execcfg.DefaultInfoID)
info.ExecutorId = &mesos.ExecutorID{Value: proto.String(eid.String())}
return info, eid, nil
}
// TODO(jdef): hacked from kubelet/server/server.go
// TODO(k8s): replace this with clientcmd
func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) {
authInfo, err := clientauth.LoadFromFile(s.AuthPath)
if err != nil {
log.Warningf("Could not load kubernetes auth path: %v. Continuing with defaults.", err)
}
if authInfo == nil {
// authInfo didn't load correctly - continue with defaults.
authInfo = &clientauth.Info{}
}
clientConfig, err := authInfo.MergeWithConfig(client.Config{})
if err != nil {
return nil, err
}
if len(s.APIServerList) < 1 {
return nil, fmt.Errorf("no api servers specified")
}
// TODO: adapt Kube client to support LB over several servers
if len(s.APIServerList) > 1 {
log.Infof("Multiple api servers specified. Picking first one")
}
clientConfig.Host = s.APIServerList[0]
c, err := client.New(&clientConfig)
if err != nil {
return nil, err
}
return c, nil
}
func (s *SchedulerServer) setDriver(driver bindings.SchedulerDriver) {
s.driverMutex.Lock()
defer s.driverMutex.Unlock()
s.driver = driver
}
func (s *SchedulerServer) getDriver() (driver bindings.SchedulerDriver) {
s.driverMutex.RLock()
defer s.driverMutex.RUnlock()
return s.driver
}
func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
// get scheduler low-level config
sc := schedcfg.CreateDefaultConfig()
if s.SchedulerConfigFileName != "" {
f, err := os.Open(s.SchedulerConfigFileName)
if err != nil {
log.Fatalf("Cannot open scheduler config file: %v", err)
}
err = sc.Read(bufio.NewReader(f))
if err != nil {
log.Fatalf("Invalid scheduler config file: %v", err)
}
}
schedulerProcess, driverFactory, etcdClient, eid := s.bootstrap(hks, sc)
if s.EnableProfiling {
profile.InstallHandler(s.mux)
}
go runtime.Until(func() {
log.V(1).Info("Starting HTTP interface")
log.Error(http.ListenAndServe(net.JoinHostPort(s.Address.String(), strconv.Itoa(s.Port)), s.mux))
}, sc.HttpBindInterval.Duration, schedulerProcess.Terminal())
if s.HA {
validation := ha.ValidationFunc(validateLeadershipTransition)
srv := ha.NewCandidate(schedulerProcess, driverFactory, validation)
path := fmt.Sprintf(meta.DefaultElectionFormat, s.FrameworkName)
sid := uid.New(eid.Group(), "").String()
log.Infof("registering for election at %v with id %v", path, sid)
go election.Notify(election.NewEtcdMasterElector(etcdClient), path, sid, srv, nil)
} else {
log.Infoln("self-electing in non-HA mode")
schedulerProcess.Elect(driverFactory)
}
return s.awaitFailover(schedulerProcess, func() error { return s.failover(s.getDriver(), hks) })
}
// awaitFailover watches the scheduler process for failover signals and handles them appropriately. May never return.
func (s *SchedulerServer) awaitFailover(schedulerProcess schedulerProcessInterface, handler func() error) error {
// we only want to return the first error (if any), everyone else can block forever
errCh := make(chan error, 1)
doFailover := func() error {
// we really don't expect handler to return, if it does something went seriously wrong
err := handler()
if err != nil {
defer schedulerProcess.End()
err = fmt.Errorf("failover failed, scheduler will terminate: %v", err)
}
return err
}
// guard for failover signal processing, first signal processor wins
failoverLatch := &runtime.Latch{}
runtime.On(schedulerProcess.Terminal(), func() {
if !failoverLatch.Acquire() {
log.V(1).Infof("scheduler process ending, already failing over")
select {}
}
var err error
defer func() { errCh <- err }()
select {
case <-schedulerProcess.Failover():
err = doFailover()
default:
if s.HA {
err = fmt.Errorf("ha scheduler exiting instead of failing over")
} else {
log.Infof("exiting scheduler")
}
}
})
runtime.OnOSSignal(makeFailoverSigChan(), func(_ os.Signal) {
if !failoverLatch.Acquire() {
log.V(1).Infof("scheduler process signalled, already failing over")
select {}
}
errCh <- doFailover()
})
return <-errCh
}
func validateLeadershipTransition(desired, current string) {
log.Infof("validating leadership transition")
d := uid.Parse(desired).Group()
c := uid.Parse(current).Group()
if d == 0 {
// should *never* happen, but..
log.Fatalf("illegal scheduler UID: %q", desired)
}
if d != c && c != 0 {
log.Fatalf("desired scheduler group (%x) != current scheduler group (%x)", d, c)
}
}
// hacked from https://github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kube-apiserver/app/server.go
func newEtcd(etcdConfigFile string, etcdServerList util.StringList) (client tools.EtcdGetSet, err error) {
if etcdConfigFile != "" {
client, err = etcd.NewClientFromFile(etcdConfigFile)
} else {
client = etcd.NewClient(etcdServerList)
}
return
}
func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config) (*ha.SchedulerProcess, ha.DriverFactory, tools.EtcdGetSet, *uid.UID) {
s.FrameworkName = strings.TrimSpace(s.FrameworkName)
if s.FrameworkName == "" {
log.Fatalf("framework-name must be a non-empty string")
}
s.FrameworkWebURI = strings.TrimSpace(s.FrameworkWebURI)
metrics.Register()
runtime.Register()
s.mux.Handle("/metrics", prometheus.Handler())
if (s.EtcdConfigFile != "" && len(s.EtcdServerList) != 0) || (s.EtcdConfigFile == "" && len(s.EtcdServerList) == 0) {
log.Fatalf("specify either --etcd-servers or --etcd-config")
}
if len(s.APIServerList) < 1 {
log.Fatal("No api servers specified.")
}
client, err := s.createAPIServerClient()
if err != nil {
log.Fatalf("Unable to make apiserver client: %v", err)
}
s.client = client
if s.ReconcileCooldown < defaultReconcileCooldown {
s.ReconcileCooldown = defaultReconcileCooldown
log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.ReconcileCooldown)
}
executor, eid, err := s.prepareExecutorInfo(hks)
if err != nil {
log.Fatalf("misconfigured executor: %v", err)
}
// TODO(jdef): remove the dependency on etcd as soon as
// (1) the generic config store is available for the FrameworkId storage
// (2) the generic master election is provided by the apiserver
// Compare docs/proposals/high-availability.md
etcdClient, err := newEtcd(s.EtcdConfigFile, s.EtcdServerList)
if err != nil {
log.Fatalf("misconfigured etcd: %v", err)
}
mesosPodScheduler := scheduler.New(scheduler.Config{
Schedcfg: *sc,
Executor: executor,
ScheduleFunc: scheduler.FCFSScheduleFunc,
Client: client,
EtcdClient: etcdClient,
FailoverTimeout: s.FailoverTimeout,
ReconcileInterval: s.ReconcileInterval,
ReconcileCooldown: s.ReconcileCooldown,
})
masterUri := s.MesosMaster
info, cred, err := s.buildFrameworkInfo()
if err != nil {
log.Fatalf("Misconfigured mesos framework: %v", err)
}
schedulerProcess := ha.New(mesosPodScheduler)
dconfig := &bindings.DriverConfig{
Scheduler: schedulerProcess,
Framework: info,
Master: masterUri,
Credential: cred,
BindingAddress: net.IP(s.Address),
BindingPort: uint16(s.DriverPort),
HostnameOverride: s.HostnameOverride,
WithAuthContext: func(ctx context.Context) context.Context {
ctx = auth.WithLoginProvider(ctx, s.MesosAuthProvider)
ctx = sasl.WithBindingAddress(ctx, net.IP(s.Address))
return ctx
},
}
kpl := scheduler.NewPlugin(mesosPodScheduler.NewDefaultPluginConfig(schedulerProcess.Terminal(), s.mux))
runtime.On(mesosPodScheduler.Registration(), func() { kpl.Run(schedulerProcess.Terminal()) })
runtime.On(mesosPodScheduler.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))
driverFactory := ha.DriverFactory(func() (drv bindings.SchedulerDriver, err error) {
log.V(1).Infoln("performing deferred initialization")
if err = mesosPodScheduler.Init(schedulerProcess.Master(), kpl, s.mux); err != nil {
return nil, fmt.Errorf("failed to initialize pod scheduler: %v", err)
}
log.V(1).Infoln("deferred init complete")
// defer obtaining framework ID to prevent multiple schedulers
// from overwriting each other's framework IDs
dconfig.Framework.Id, err = s.fetchFrameworkID(etcdClient)
if err != nil {
return nil, fmt.Errorf("failed to fetch framework ID from etcd: %v", err)
}
log.V(1).Infoln("constructing mesos scheduler driver")
drv, err = bindings.NewMesosSchedulerDriver(*dconfig)
if err != nil {
return nil, fmt.Errorf("failed to construct scheduler driver: %v", err)
}
log.V(1).Infoln("constructed mesos scheduler driver:", drv)
s.setDriver(drv)
return drv, nil
})
return schedulerProcess, driverFactory, etcdClient, eid
}
func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkube.Interface) error {
if driver != nil {
stat, err := driver.Stop(true)
if stat != mesos.Status_DRIVER_STOPPED {
return fmt.Errorf("failed to stop driver for failover, received unexpected status code: %v", stat)
} else if err != nil {
return err
}
}
// there's no guarantee that all goroutines are actually programmed intelligently with 'done'
// signals, so we'll need to restart if we want to really stop everything
// run the same command that we were launched with
//TODO(jdef) assumption here is that the scheduler is the only service running in this process; we should probably validate that somehow
args := []string{}
flags := pflag.CommandLine
if hks != nil {
args = append(args, hks.Name())
flags = hks.Flags()
}
flags.Visit(func(flag *pflag.Flag) {
if flag.Name != "api-servers" && flag.Name != "etcd-servers" {
args = append(args, fmt.Sprintf("--%s=%s", flag.Name, flag.Value.String()))
}
})
if !s.Graceful {
args = append(args, "--graceful")
}
if len(s.APIServerList) > 0 {
args = append(args, "--api-servers="+strings.Join(s.APIServerList, ","))
}
if len(s.EtcdServerList) > 0 {
args = append(args, "--etcd-servers="+strings.Join(s.EtcdServerList, ","))
}
args = append(args, flags.Args()...)
log.V(1).Infof("spawning scheduler for graceful failover: %s %+v", s.executable, args)
cmd := exec.Command(s.executable, args...)
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.SysProcAttr = makeDisownedProcAttr()
// TODO(jdef) pass in a pipe FD so that we can block, waiting for the child proc to be ready
//cmd.ExtraFiles = []*os.File{}
exitcode := 0
log.Flush() // TODO(jdef) it would be really nice to ensure that no one else in our process was still logging
if err := cmd.Start(); err != nil {
//log to stdout here to avoid conflicts with normal stderr logging
fmt.Fprintf(os.Stdout, "failed to spawn failover process: %v\n", err)
os.Exit(1)
}
os.Exit(exitcode)
select {} // will never reach here
}
func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred *mesos.Credential, err error) {
username, err := s.getUsername()
if err != nil {
return nil, nil, err
}
log.V(2).Infof("Framework configured with mesos user %v", username)
info = &mesos.FrameworkInfo{
Name: proto.String(s.FrameworkName),
User: proto.String(username),
Checkpoint: proto.Bool(s.Checkpoint),
}
if s.FrameworkWebURI != "" {
info.WebuiUrl = proto.String(s.FrameworkWebURI)
}
if s.FailoverTimeout > 0 {
info.FailoverTimeout = proto.Float64(s.FailoverTimeout)
}
if s.MesosRole != "" {
info.Role = proto.String(s.MesosRole)
}
if s.MesosAuthPrincipal != "" {
info.Principal = proto.String(s.MesosAuthPrincipal)
if s.MesosAuthSecretFile == "" {
return nil, nil, errors.New("authentication principal specified without the required credentials file")
}
secret, err := ioutil.ReadFile(s.MesosAuthSecretFile)
if err != nil {
return nil, nil, err
}
cred = &mesos.Credential{
Principal: proto.String(s.MesosAuthPrincipal),
Secret: secret,
}
}
return
}
func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdGetSet) (*mesos.FrameworkID, error) {
if s.FailoverTimeout > 0 {
if response, err := client.Get(meta.FrameworkIDKey, false, false); err != nil {
if !tools.IsEtcdNotFound(err) {
return nil, fmt.Errorf("unexpected failure attempting to load framework ID from etcd: %v", err)
}
log.V(1).Infof("did not find framework ID in etcd")
} else if response.Node.Value != "" {
log.Infof("configuring FrameworkInfo with Id found in etcd: '%s'", response.Node.Value)
return mutil.NewFrameworkID(response.Node.Value), nil
}
} else {
//TODO(jdef) this seems like a totally hackish way to clean up the framework ID
if _, err := client.Delete(meta.FrameworkIDKey, true); err != nil {
if !tools.IsEtcdNotFound(err) {
return nil, fmt.Errorf("failed to delete framework ID from etcd: %v", err)
}
log.V(1).Infof("nothing to delete: did not find framework ID in etcd")
}
}
return nil, nil
}
func (s *SchedulerServer) getUsername() (username string, err error) {
username = s.MesosUser
if username == "" {
if u, err := user.Current(); err == nil {
username = u.Username
if username == "" {
username = defaultMesosUser
}
}
}
return
}
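For orientation only (not part of this commit): a bare-bones standalone entry point would wire NewSchedulerServer, AddStandaloneFlags, and Run together roughly as below. The function name is hypothetical, and it assumes --executor-path and --proxy-path are supplied on the command line so that no hyperkube server lookup is needed and a nil hyperkube.Interface is acceptable.
// Hypothetical sketch of a standalone scheduler entry point.
func runStandaloneScheduler() {
    s := NewSchedulerServer()
    s.AddStandaloneFlags(pflag.CommandLine)
    pflag.Parse()
    if err := s.Run(nil, pflag.Args()); err != nil {
        log.Fatalf("scheduler terminated with error: %v", err)
    }
}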

View File

@@ -0,0 +1,108 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// +build unit_test
package service
import (
"testing"
"time"
)
type fakeSchedulerProcess struct {
doneFunc func() <-chan struct{}
failoverFunc func() <-chan struct{}
}
func (self *fakeSchedulerProcess) Terminal() <-chan struct{} {
if self == nil || self.doneFunc == nil {
return nil
}
return self.doneFunc()
}
func (self *fakeSchedulerProcess) Failover() <-chan struct{} {
if self == nil || self.failoverFunc == nil {
return nil
}
return self.failoverFunc()
}
func (self *fakeSchedulerProcess) End() <-chan struct{} {
ch := make(chan struct{})
close(ch)
return ch
}
func Test_awaitFailoverDone(t *testing.T) {
done := make(chan struct{})
p := &fakeSchedulerProcess{
doneFunc: func() <-chan struct{} { return done },
}
ss := &SchedulerServer{}
failoverHandlerCalled := false
failoverFailedHandler := func() error {
failoverHandlerCalled = true
return nil
}
errCh := make(chan error, 1)
go func() {
errCh <- ss.awaitFailover(p, failoverFailedHandler)
}()
close(done)
select {
case err := <-errCh:
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
case <-time.After(1 * time.Second):
t.Fatalf("timed out waiting for failover")
}
if failoverHandlerCalled {
t.Fatalf("unexpected call to failover handler")
}
}
func Test_awaitFailoverDoneFailover(t *testing.T) {
ch := make(chan struct{})
p := &fakeSchedulerProcess{
doneFunc: func() <-chan struct{} { return ch },
failoverFunc: func() <-chan struct{} { return ch },
}
ss := &SchedulerServer{}
failoverHandlerCalled := false
failoverFailedHandler := func() error {
failoverHandlerCalled = true
return nil
}
errCh := make(chan error, 1)
go func() {
errCh <- ss.awaitFailover(p, failoverFailedHandler)
}()
close(ch)
select {
case err := <-errCh:
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
case <-time.After(1 * time.Second):
t.Fatalf("timed out waiting for failover")
}
if !failoverHandlerCalled {
t.Fatalf("expected call to failover handler")
}
}

View File

@@ -0,0 +1,88 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"bytes"
"fmt"
"hash/crc64"
"sort"
"strconv"
mesos "github.com/mesos/mesos-go/mesosproto"
)
// hashExecutorInfo computes a hashcode for an ExecutorInfo that may be used as a reasonable
// litmus test with respect to compatibility across HA schedulers. The intent is that an HA
// scheduler should fail fast if it doesn't pass this test, rather than generating (potentially
// many) errors at run-time because a Mesos master decides that the ExecutorInfo generated by a
// secondary scheduler doesn't match that of the primary scheduler.
//
// See https://github.com/apache/mesos/blob/0.22.0/src/common/type_utils.cpp#L110
func hashExecutorInfo(info *mesos.ExecutorInfo) uint64 {
// !!! we specifically do NOT include:
// - Framework ID because it's a value that's initialized too late for us to use
// - Executor ID because it's a value that includes a copy of this hash
buf := &bytes.Buffer{}
buf.WriteString(info.GetName())
buf.WriteString(info.GetSource())
buf.Write(info.Data)
if info.Command != nil {
buf.WriteString(info.Command.GetValue())
buf.WriteString(info.Command.GetUser())
buf.WriteString(strconv.FormatBool(info.Command.GetShell()))
if sz := len(info.Command.Arguments); sz > 0 {
x := make([]string, sz)
copy(x, info.Command.Arguments)
sort.Strings(x)
for _, item := range x {
buf.WriteString(item)
}
}
if vars := info.Command.Environment.GetVariables(); vars != nil && len(vars) > 0 {
names := []string{}
e := make(map[string]string)
for _, v := range vars {
if name := v.GetName(); name != "" {
names = append(names, name)
e[name] = v.GetValue()
}
}
sort.Strings(names)
for _, n := range names {
buf.WriteString(n)
buf.WriteString("=")
buf.WriteString(e[n])
}
}
if uris := info.Command.GetUris(); len(uris) > 0 {
su := []string{}
for _, uri := range uris {
su = append(su, fmt.Sprintf("%s%t%t", uri.GetValue(), uri.GetExecutable(), uri.GetExtract()))
}
sort.Strings(su)
for _, uri := range su {
buf.WriteString(uri)
}
}
//TODO(jdef) add support for Resources and Container
}
table := crc64.MakeTable(crc64.ECMA)
return crc64.Checksum(buf.Bytes(), table)
}
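Illustrative only (not part of this commit): because command arguments, environment variables, and URIs are sorted before hashing, two HA schedulers that assemble the same executor command with flags in a different order still agree on the hash, and therefore on the derived executor UID. A minimal sketch with made-up values, assuming the gogo/protobuf proto helpers used elsewhere in this commit:
a := &mesos.ExecutorInfo{
    Name:    proto.String("k8sm-executor"),
    Source:  proto.String("kubernetes"),
    Command: &mesos.CommandInfo{Value: proto.String("./km"), Arguments: []string{"--v=2", "--run-proxy=true"}},
}
b := &mesos.ExecutorInfo{
    Name:    proto.String("k8sm-executor"),
    Source:  proto.String("kubernetes"),
    Command: &mesos.CommandInfo{Value: proto.String("./km"), Arguments: []string{"--run-proxy=true", "--v=2"}},
}
// hashExecutorInfo(a) == hashExecutorInfo(b): argument order does not affect the hash.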
