Merge pull request #16316 from mesosphere/scheduler-refactor

MESOS: Refactor scheduler
Dr. Stefan Schimanski
2015-11-12 15:28:25 +01:00
52 changed files with 3254 additions and 2591 deletions

Binary file not shown.

View File

@@ -99,7 +99,7 @@ type NodeInfo struct {
// KubernetesExecutor is a Mesos executor that runs pods
// in a minion machine.
type KubernetesExecutor struct {
type Executor struct {
updateChan chan<- kubetypes.PodUpdate // sent to the kubelet, closed on shutdown
state stateType
tasks map[string]*kuberTask
@@ -136,13 +136,13 @@ type Config struct {
NodeInfos chan<- NodeInfo
}
func (k *KubernetesExecutor) isConnected() bool {
func (k *Executor) isConnected() bool {
return connectedState == (&k.state).get()
}
// New creates a new kubernetes executor.
func New(config Config) *KubernetesExecutor {
k := &KubernetesExecutor{
func New(config Config) *Executor {
k := &Executor{
updateChan: config.Updates,
state: disconnectedState,
tasks: make(map[string]*kuberTask),
@@ -187,7 +187,7 @@ func New(config Config) *KubernetesExecutor {
return k
}
func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
func (k *Executor) Init(driver bindings.ExecutorDriver) {
k.killKubeletContainers()
k.resetSuicideWatch(driver)
@@ -196,7 +196,7 @@ func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
//TODO(jdef) monitor kubeletFinished and shutdown if it happens
}
func (k *KubernetesExecutor) isDone() bool {
func (k *Executor) isDone() bool {
select {
case <-k.terminate:
return true
@@ -206,7 +206,7 @@ func (k *KubernetesExecutor) isDone() bool {
}
// sendPodUpdate assumes that the caller is holding the state lock; returns true when the update is sent, otherwise false
func (k *KubernetesExecutor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
func (k *Executor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
if k.isDone() {
return false
}
@@ -215,7 +215,7 @@ func (k *KubernetesExecutor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
}
// Registered is called when the executor is successfully registered with the slave.
func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,
func (k *Executor) Registered(driver bindings.ExecutorDriver,
executorInfo *mesos.ExecutorInfo, frameworkInfo *mesos.FrameworkInfo, slaveInfo *mesos.SlaveInfo) {
if k.isDone() {
return
@@ -252,7 +252,7 @@ func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,
// Reregistered is called when the executor is successfully re-registered with the slave.
// This can happen when the slave fails over.
func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
func (k *Executor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
if k.isDone() {
return
}
@@ -280,7 +280,7 @@ func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveI
}
// initializeStaticPodsSource unzips the data slice into the static-pods directory
func (k *KubernetesExecutor) initializeStaticPodsSource(data []byte) {
func (k *Executor) initializeStaticPodsSource(data []byte) {
log.V(2).Infof("extracting static pods config to %s", k.staticPodsConfigPath)
err := archive.UnzipDir(data, k.staticPodsConfigPath)
if err != nil {
@@ -290,7 +290,7 @@ func (k *KubernetesExecutor) initializeStaticPodsSource(data []byte) {
}
// Disconnected is called when the executor is disconnected from the slave.
func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
func (k *Executor) Disconnected(driver bindings.ExecutorDriver) {
if k.isDone() {
return
}
@@ -306,7 +306,7 @@ func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
// is running, but the binding is not recorded in the Kubernetes store yet.
// This function is invoked to tell the executor to record the binding in the
// Kubernetes store and start the pod via the Kubelet.
func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
func (k *Executor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
if k.isDone() {
return
}
@@ -356,7 +356,7 @@ func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo
go k.launchTask(driver, taskId, pod)
}
func (k *KubernetesExecutor) handleChangedApiserverPod(pod *api.Pod) {
func (k *Executor) handleChangedApiserverPod(pod *api.Pod) {
// exclude "pre-scheduled" pods which have a NodeName set to this node without being scheduled already
taskId := pod.Annotations[meta.TaskIdKey]
if taskId == "" {
@@ -402,7 +402,7 @@ func (k *KubernetesExecutor) handleChangedApiserverPod(pod *api.Pod) {
// a timer that, upon expiration, causes this executor to commit suicide.
// this implementation runs asynchronously. callers that wish to wait for the
// reset to complete may wait for the returned signal chan to close.
func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
func (k *Executor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
ch := make(chan struct{})
go func() {
defer close(ch)
@@ -432,7 +432,7 @@ func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <
return ch
}
func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
func (k *Executor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
k.lock.Lock()
defer k.lock.Unlock()
@@ -464,7 +464,7 @@ func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abor
}
// async continuation of LaunchTask
func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {
func (k *Executor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {
deleteTask := func() {
k.lock.Lock()
defer k.lock.Unlock()
@@ -475,7 +475,7 @@ func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId s
// TODO(k8s): use Pods interface for binding once clusters are upgraded
// return b.Pods(binding.Namespace).Bind(binding)
if pod.Spec.NodeName == "" {
//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/scheduler.go
//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/framework.go
binding := &api.Binding{
ObjectMeta: api.ObjectMeta{
Namespace: pod.Namespace,
@@ -588,7 +588,7 @@ func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId s
go k._launchTask(driver, taskId, podFullName, psf)
}
func (k *KubernetesExecutor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
func (k *Executor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
expired := make(chan struct{})
@@ -669,7 +669,7 @@ reportLost:
k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
}
func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
func (k *Executor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
// TODO(nnielsen): Monitor health of pod and report if lost.
// Should we also allow this to fail a couple of times before reporting lost?
// What if the docker daemon is restarting and we can't connect, but it's
@@ -692,7 +692,7 @@ func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId
// whether the pod is running. It will only return false if the task is still registered and the pod is
// registered in Docker. Otherwise it returns true. If there's still a task record on file, but no pod
// in Docker, then we'll also send a TASK_LOST event.
func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
func (k *Executor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
// TODO (jdefelice) don't send false alarms for deleted pods (KILLED tasks)
k.lock.Lock()
defer k.lock.Unlock()
@@ -716,7 +716,7 @@ func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver,
}
// KillTask is called when the executor receives a request to kill a task.
func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
func (k *Executor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
if k.isDone() {
return
}
@@ -735,14 +735,14 @@ func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *me
// Reports a lost task to the slave and updates internal task and pod tracking state.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
func (k *Executor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
k.removePodTask(driver, tid, reason, mesos.TaskState_TASK_LOST)
}
// deletes the pod and task associated with the task identified by tid and sends a task
// status update to mesos. also attempts to reset the suicide watch.
// Assumes that the caller is locking around pod and task state.
func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
func (k *Executor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
task, ok := k.tasks[tid]
if !ok {
log.V(1).Infof("Failed to remove task, unknown task %v\n", tid)
@@ -770,7 +770,7 @@ func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid,
}
// FrameworkMessage is called when the framework sends some message to the executor
func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
func (k *Executor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
if k.isDone() {
return
}
@@ -780,7 +780,7 @@ func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, me
}
log.Infof("Receives message from framework %v\n", message)
//TODO(jdef) master reported a lost task, reconcile this! @see scheduler.go:handleTaskLost
//TODO(jdef) master reported a lost task, reconcile this! @see framework.go:handleTaskLost
if strings.HasPrefix(message, messages.TaskLost+":") {
taskId := message[len(messages.TaskLost)+1:]
if taskId != "" {
@@ -798,14 +798,14 @@ func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, me
}
// Shutdown is called when the executor receives a shutdown request.
func (k *KubernetesExecutor) Shutdown(driver bindings.ExecutorDriver) {
func (k *Executor) Shutdown(driver bindings.ExecutorDriver) {
k.lock.Lock()
defer k.lock.Unlock()
k.doShutdown(driver)
}
// assumes that caller has obtained state lock
func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
func (k *Executor) doShutdown(driver bindings.ExecutorDriver) {
defer func() {
log.Errorf("exiting with unclean shutdown: %v", recover())
if k.exitFunc != nil {
@@ -859,7 +859,7 @@ func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
}
// Destroy existing k8s containers
func (k *KubernetesExecutor) killKubeletContainers() {
func (k *Executor) killKubeletContainers() {
if containers, err := dockertools.GetKubeletDockerContainers(k.dockerClient, true); err == nil {
opts := docker.RemoveContainerOptions{
RemoveVolumes: true,
@@ -878,7 +878,7 @@ func (k *KubernetesExecutor) killKubeletContainers() {
}
// Error is called when some error happens.
func (k *KubernetesExecutor) Error(driver bindings.ExecutorDriver, message string) {
func (k *Executor) Error(driver bindings.ExecutorDriver, message string) {
log.Errorln(message)
}
@@ -890,7 +890,7 @@ func newStatus(taskId *mesos.TaskID, state mesos.TaskState, message string) *mes
}
}
func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
func (k *Executor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
select {
case <-k.terminate:
default:
@@ -898,7 +898,7 @@ func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *
}
}
func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
func (k *Executor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
select {
case <-k.terminate:
default:
@@ -906,7 +906,7 @@ func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver
}
}
func (k *KubernetesExecutor) sendLoop() {
func (k *Executor) sendLoop() {
defer log.V(1).Info("sender loop exiting")
for {
select {
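The isDone/sendStatus/sendLoop plumbing above all hangs off a single terminate channel that is closed exactly once at shutdown; a receive from a closed channel always succeeds immediately, which makes the non-blocking done-check cheap. A minimal, self-contained sketch of the idiom, with illustrative names rather than the executor's real fields:

package main

import "fmt"

type executor struct {
	terminate chan struct{} // closed exactly once on shutdown
}

func (e *executor) isDone() bool {
	select {
	case <-e.terminate:
		return true // a receive on a closed channel never blocks
	default:
		return false
	}
}

func (e *executor) shutdown() {
	close(e.terminate) // wakes every current and future receiver
}

func main() {
	e := &executor{terminate: make(chan struct{})}
	fmt.Println(e.isDone()) // false
	e.shutdown()
	fmt.Println(e.isDone()) // true
}

Closing a channel is the one channel operation that broadcasts to every current and future receiver, which is why it is preferred here over sending a value.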

View File

@@ -170,11 +170,10 @@ func TestExecutorLaunchAndKillTask(t *testing.T) {
}
pod := NewTestPod(1)
podTask, err := podtask.New(api.NewDefaultContext(), "",
*pod, &mesosproto.ExecutorInfo{})
podTask, err := podtask.New(api.NewDefaultContext(), "", pod)
assert.Equal(t, nil, err, "must be able to create a task from a pod")
taskInfo := podTask.BuildTaskInfo()
taskInfo := podTask.BuildTaskInfo(&mesosproto.ExecutorInfo{})
data, err := testapi.Default.Codec().Encode(pod)
assert.Equal(t, nil, err, "must be able to encode a pod's spec data")
taskInfo.Data = data
@@ -417,10 +416,8 @@ func TestExecutorFrameworkMessage(t *testing.T) {
// set up a pod to then lose
pod := NewTestPod(1)
podTask, _ := podtask.New(api.NewDefaultContext(), "foo",
*pod, &mesosproto.ExecutorInfo{})
taskInfo := podTask.BuildTaskInfo()
podTask, _ := podtask.New(api.NewDefaultContext(), "foo", pod)
taskInfo := podTask.BuildTaskInfo(&mesosproto.ExecutorInfo{})
data, _ := testapi.Default.Codec().Encode(pod)
taskInfo.Data = data

View File

@@ -66,7 +66,7 @@ func (m *MockExecutorDriver) SendFrameworkMessage(msg string) (mesosproto.Status
return args.Get(0).(mesosproto.Status), args.Error(1)
}
func NewTestKubernetesExecutor() (*KubernetesExecutor, chan kubetypes.PodUpdate) {
func NewTestKubernetesExecutor() (*Executor, chan kubetypes.PodUpdate) {
updates := make(chan kubetypes.PodUpdate, 1024)
return New(Config{
Docker: dockertools.ConnectToDockerOrDie("fake://"),

View File

@@ -219,7 +219,7 @@ func (ms *MinionServer) launchHyperkubeServer(server string, args []string, logF
}
pwd, err := os.Getwd()
if err != nil {
log.Fatalf("Cannot get current directory: %v", err)
panic(fmt.Errorf("Cannot get current directory: %v", err))
}
kmEnv = append(kmEnv, fmt.Sprintf("%s:%s", e, path.Join(pwd, "bin")))
}
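For context on the log.Fatalf to panic change above: log.Fatalf terminates the process via os.Exit, so deferred cleanup never runs, whereas a panic unwinds the stack, runs defers, and can still be recovered higher up, which the minion server's supervision presumably wants. A minimal illustration, not the minion server's actual code:

package main

import "fmt"

func main() {
	defer fmt.Println("cleanup runs") // executed during panic unwinding, skipped by os.Exit
	panic(fmt.Errorf("cannot get current directory: %v", "example error"))
}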

View File

@@ -0,0 +1,167 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package algorithm
import (
"fmt"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
)
type SchedulerAlgorithm interface {
Schedule(pod *api.Pod) (string, error)
}
// schedulerAlgorithm implements the SchedulerAlgorithm interface
type schedulerAlgorithm struct {
sched scheduler.Scheduler
podUpdates queue.FIFO
podScheduler podschedulers.PodScheduler
}
func New(sched scheduler.Scheduler, podUpdates queue.FIFO, podScheduler podschedulers.PodScheduler) SchedulerAlgorithm {
return &schedulerAlgorithm{
sched: sched,
podUpdates: podUpdates,
podScheduler: podScheduler,
}
}
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selected machine's name and an error (if there is any).
func (k *schedulerAlgorithm) Schedule(pod *api.Pod) (string, error) {
log.Infof("Try to schedule pod %v\n", pod.Name)
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
// default upstream scheduler passes pod.Name as binding.PodID
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return "", err
}
k.sched.Lock()
defer k.sched.Unlock()
switch task, state := k.sched.Tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// There's a bit of a potential race here: a pod could have been yielded() and
// then deleted before we get *here*.
// We use meta to index the pod in the store since that's what k8s reflector does.
podName, err := cache.MetaNamespaceKeyFunc(pod)
if err != nil {
log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
return "", errors.NoSuchPodErr
}
if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
log.Infof("aborting Schedule, pod has been deleted %+v", pod)
return "", errors.NoSuchPodErr
}
podTask, err := podtask.New(ctx, "", pod)
if err != nil {
log.Warningf("aborting Schedule, unable to create podtask object %+v: %v", pod, err)
return "", err
}
podTask, err = k.sched.Tasks().Register(podTask)
if err != nil {
return "", err
}
return k.doSchedule(podTask)
//TODO(jdef) it's possible that the pod state has diverged from what
//we knew previously, we should probably update the task.Pod state here
//before proceeding with scheduling
case podtask.StatePending:
if pod.UID != task.Pod.UID {
// we're dealing with a brand new pod spec here, so the old one must have been
// deleted -- and so our task store is out of sync w/ respect to reality
//TODO(jdef) reconcile task
return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
} else if task.Has(podtask.Launched) {
// task has been marked as "launched" but the pod binding creation may have failed in k8s;
// we're going to let someone else handle it, probably the mesos task error handler
return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
} else {
return k.doSchedule(task)
}
default:
return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
}
}
// Call SchedulePod and procure resources, returning the name of the machine the task is scheduled on
func (k *schedulerAlgorithm) doSchedule(task *podtask.T) (string, error) {
var offer offers.Perishable
var err error
if task.HasAcceptedOffer() {
// verify that the offer is still on the table
var ok bool
offer, ok = k.sched.Offers().Get(task.GetOfferId())
if !ok || offer.HasExpired() {
task.Offer.Release()
task.Reset()
if err = k.sched.Tasks().Update(task); err != nil {
return "", err
}
}
}
if offer == nil {
offer, err = k.podScheduler.SchedulePod(k.sched.Offers(), task)
}
if err != nil {
return "", err
}
details := offer.Details()
if details == nil {
return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
}
if task.Offer != nil && task.Offer != offer {
return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
}
task.Offer = offer
if err := k.podScheduler.Procurement()(task, details); err != nil {
offer.Release()
task.Reset()
return "", err
}
if err := k.sched.Tasks().Update(task); err != nil {
offer.Release()
return "", err
}
return details.GetHostname(), nil
}
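To make the SchedulerAlgorithm contract above concrete: Schedule maps a single pod to a node name, or fails. A minimal, self-contained sketch of that shape; the Pod type and the round-robin policy are stand-ins, not the real k8s API objects or this PR's algorithm:

package main

import (
	"errors"
	"fmt"
)

type Pod struct{ Name string }

type SchedulerAlgorithm interface {
	Schedule(pod *Pod) (string, error)
}

// roundRobin assigns pods to a fixed node list in turn.
type roundRobin struct {
	nodes []string
	next  int
}

func (r *roundRobin) Schedule(pod *Pod) (string, error) {
	if len(r.nodes) == 0 {
		return "", errors.New("no nodes available")
	}
	node := r.nodes[r.next%len(r.nodes)]
	r.next++
	return node, nil
}

func main() {
	var alg SchedulerAlgorithm = &roundRobin{nodes: []string{"node-a", "node-b"}}
	host, _ := alg.Schedule(&Pod{Name: "web-1"})
	fmt.Println(host) // node-a
}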

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package algorithm implements the SchedulerAlgorithm
package algorithm

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package podschedulers defines an interface (w/ implementations) for matching
// pods against offers.
package podschedulers

View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package podschedulers
import (
"fmt"
@@ -23,6 +23,7 @@ import (
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
)
@@ -62,7 +63,7 @@ func NewFCFSPodScheduler(as AllocationStrategy, lookupNode node.LookupFunc) PodS
}
// A first-come-first-served scheduler: acquires the first offer that can support the task
func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, error) {
podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name)
var acceptedOffer offers.Perishable
err := r.Walk(func(p offers.Perishable) (bool, error) {
@@ -101,5 +102,5 @@ func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, unused SlaveIndex, t
return nil, err
}
log.V(2).Infof("failed to find a fit for pod: %s", podName)
return nil, noSuitableOffersErr
return nil, errors.NoSuitableOffersErr
}
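The offer walk in SchedulePod above reduces to a first-fit search: take the first offer whose resources cover the task. A rough, self-contained sketch under simplified types; the real code walks an offers.Registry of perishable offers rather than a plain slice:

package main

import (
	"errors"
	"fmt"
)

type Offer struct {
	Host string
	CPU  float64
	Mem  float64 // MB
}

var errNoSuitableOffers = errors.New("No suitable offers for pod/task")

// scheduleFCFS returns the first offer satisfying the task's resource needs.
func scheduleFCFS(offers []Offer, cpu, mem float64) (*Offer, error) {
	for i := range offers {
		if offers[i].CPU >= cpu && offers[i].Mem >= mem {
			return &offers[i], nil // first fit wins
		}
	}
	return nil, errNoSuitableOffers
}

func main() {
	available := []Offer{{"slave-1", 0.5, 128}, {"slave-2", 2, 1024}}
	if o, err := scheduleFCFS(available, 1, 512); err == nil {
		fmt.Println(o.Host) // slave-2
	}
}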

View File

@@ -14,11 +14,9 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package podschedulers
import (
"errors"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
)
@@ -37,25 +35,11 @@ type PodScheduler interface {
// SchedulePod implements how to schedule pods among slaves.
// We can have different implementations for different scheduling policies.
//
// The function accepts a group of slaves (each contains offers from
// that slave) and a single pod, which aligns well with the k8s scheduling
// algorithm. It returns an offerId that is acceptable for the pod, otherwise
// nil. The caller is responsible for filling in task state w/ relevant offer
// details.
// The function accepts a set of offers and a single pod, which aligns well
// with the k8s scheduling algorithm. It returns an offerId that is acceptable
// for the pod, otherwise nil. The caller is responsible for filling in task
// state w/ relevant offer details.
//
// See the FCFSPodScheduler for example.
SchedulePod(r offers.Registry, slaves SlaveIndex, task *podtask.T) (offers.Perishable, error)
}
// A minimal placeholder
type empty struct{}
var (
noSuitableOffersErr = errors.New("No suitable offers for pod/task")
noSuchPodErr = errors.New("No such pod exists")
noSuchTaskErr = errors.New("No such task exists")
)
type SlaveIndex interface {
slaveHostNameFor(id string) string
SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, error)
}

View File

@@ -0,0 +1,157 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package binder
import (
"fmt"
"strconv"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
)
type Binder interface {
Bind(binding *api.Binding) error
}
type binder struct {
sched scheduler.Scheduler
}
func New(sched scheduler.Scheduler) Binder {
return &binder{
sched: sched,
}
}
// implements binding.Registry, launches the pod-associated task in mesos
func (b *binder) Bind(binding *api.Binding) error {
ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
// default upstream scheduler passes pod.Name as binding.Name
podKey, err := podtask.MakePodKey(ctx, binding.Name)
if err != nil {
return err
}
b.sched.Lock()
defer b.sched.Unlock()
switch task, state := b.sched.Tasks().ForPod(podKey); state {
case podtask.StatePending:
return b.bind(ctx, binding, task)
default:
// in this case it's likely that the pod has been deleted between Schedule
// and Bind calls
log.Infof("No pending task for pod %s", podKey)
return errors.NoSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
}
}
func (b *binder) rollback(task *podtask.T, err error) error {
task.Offer.Release()
task.Reset()
if err2 := b.sched.Tasks().Update(task); err2 != nil {
log.Errorf("failed to update pod task: %v", err2)
}
return err
}
// assumes that the caller has acquired the scheduler lock and that the task is still pending
//
// bind does not actually do the binding itself, but launches the pod as a Mesos task. The
// kubernetes executor on the slave will finally do the binding. This is different from the
// upstream scheduler in the sense that the upstream scheduler does the binding and the
// kubelet will notice that and launch the pod.
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
// sanity check: ensure that the task hasAcceptedOffer(); it's possible that between
// Schedule() and now the offer for this task was rescinded or invalidated.
// ((we should never see this here))
if !task.HasAcceptedOffer() {
return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
}
// By this time, there is a chance that the slave is disconnected.
offerId := task.GetOfferId()
if offer, ok := b.sched.Offers().Get(offerId); !ok || offer.HasExpired() {
// already rescinded or timed out or otherwise invalidated
return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
}
if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\", cpu %.2f, mem %.2f MB",
task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name, task.Spec.CPU, task.Spec.Memory)
if err = b.sched.LaunchTask(task); err == nil {
b.sched.Offers().Invalidate(offerId)
task.Set(podtask.Launched)
if err = b.sched.Tasks().Update(task); err != nil {
// this should only happen if the task has been removed or has changed status,
// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
log.Errorf("failed to update task w/ Launched status: %v", err)
}
return
}
}
return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err))
}
//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
pod := task.Pod
// we make an effort here to avoid making changes to the task's copy of the pod, since
// we want that to reflect the initial user spec, and not the modified spec that we
// build for the executor to consume.
oemCt := pod.Spec.Containers
pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
}
task.SaveRecoveryInfo(pod.Annotations)
pod.Annotations[annotation.BindingHostKey] = task.Spec.AssignedSlave
for _, entry := range task.Spec.PortMap {
oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
ports := append([]api.ContainerPort{}, oemPorts...)
p := &ports[entry.PortIdx]
p.HostPort = int(entry.OfferPort)
op := strconv.FormatUint(entry.OfferPort, 10)
pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
if p.Name != "" {
pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
}
pod.Spec.Containers[entry.ContainerIdx].Ports = ports
}
// the kubelet-executor uses this to instantiate the pod
log.V(3).Infof("prepared pod spec: %+v", pod)
data, err := api.Codec.Encode(&pod)
if err != nil {
log.V(2).Infof("Failed to marshal the pod spec: %v", err)
return err
}
task.Spec.Data = data
return nil
}
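prepareTaskForLaunch relies on a clone-before-modify idiom: appending the containers onto an empty slice allocates a fresh backing array, so the later port edits do not leak back into the task's original pod spec. A tiny demonstration with a stand-in type:

package main

import "fmt"

type Container struct{ HostPort int }

func main() {
	original := []Container{{HostPort: 0}}

	// Shallow clone: new backing array, element values copied.
	clone := append([]Container{}, original...)
	clone[0].HostPort = 31000

	fmt.Println(original[0].HostPort) // 0 (untouched)
	fmt.Println(clone[0].HostPort)    // 31000
}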

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package binder implements the Binder which launches a task and lets the
// executor do the actual binding.
package binder

View File

@@ -0,0 +1,107 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"time"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/binder"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
)
const (
recoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling
FailedScheduling = "FailedScheduling"
Scheduled = "Scheduled"
)
type Controller interface {
Run(<-chan struct{})
}
type controller struct {
algorithm algorithm.SchedulerAlgorithm
binder binder.Binder
nextPod func() *api.Pod
error func(*api.Pod, error)
recorder record.EventRecorder
client *client.Client
started chan<- struct{} // startup latch
}
func New(client *client.Client, algorithm algorithm.SchedulerAlgorithm,
recorder record.EventRecorder, nextPod func() *api.Pod, error func(pod *api.Pod, schedulingErr error),
binder binder.Binder, started chan<- struct{}) Controller {
return &controller{
algorithm: algorithm,
binder: binder,
nextPod: nextPod,
error: error,
recorder: recorder,
client: client,
started: started,
}
}
func (s *controller) Run(done <-chan struct{}) {
defer close(s.started)
go runtime.Until(s.scheduleOne, recoveryDelay, done)
}
// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go,
// with the Modeler stuff removed since we don't need it with Mesos.
func (s *controller) scheduleOne() {
pod := s.nextPod()
// pods which are pre-scheduled (i.e. NodeName is set) are deleted by the kubelet
// in upstream. Not so in Mesos because the kubelet hasn't seen that pod yet. Hence,
// the scheduler has to take care of this:
if pod.Spec.NodeName != "" && pod.DeletionTimestamp != nil {
log.V(3).Infof("deleting pre-scheduled, not yet running pod: %s/%s", pod.Namespace, pod.Name)
s.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0))
return
}
log.V(3).Infof("Attempting to schedule: %+v", pod)
dest, err := s.algorithm.Schedule(pod)
if err != nil {
log.V(1).Infof("Failed to schedule: %+v", pod)
s.recorder.Eventf(pod, FailedScheduling, "Error scheduling: %v", err)
s.error(pod, err)
return
}
b := &api.Binding{
ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
Target: api.ObjectReference{
Kind: "Node",
Name: dest,
},
}
if err := s.binder.Bind(b); err != nil {
log.V(1).Infof("Failed to bind pod: %+v", err)
s.recorder.Eventf(pod, FailedScheduling, "Binding rejected: %v", err)
s.error(pod, err)
return
}
s.recorder.Eventf(pod, Scheduled, "Successfully assigned %v to %v", pod.Name, dest)
}
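Run above wires scheduleOne into runtime.Until, which keeps invoking the function, recovers from panics, and resumes after recoveryDelay until the done channel closes. A hedged sketch of that pattern, capturing the idea rather than the library's exact code:

package main

import (
	"fmt"
	"time"
)

// until repeatedly runs f, recovering from panics, until done closes.
func until(f func(), period time.Duration, done <-chan struct{}) {
	for {
		select {
		case <-done:
			return
		default:
		}
		func() {
			defer func() {
				if r := recover(); r != nil {
					fmt.Println("recovered:", r) // a crash delays, but does not kill, the loop
				}
			}()
			f()
		}()
		select {
		case <-done:
			return
		case <-time.After(period):
		}
	}
}

func main() {
	done := make(chan struct{})
	go until(func() { fmt.Println("scheduleOne tick") }, 100*time.Millisecond, done)
	time.Sleep(250 * time.Millisecond)
	close(done)
}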

View File

@@ -0,0 +1,20 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package controller implements the scheduling controller which waits for pod
// events from the queuer (i.e. from the apiserver), passes them to the
// SchedulerAlgorithm and in case of success to the binder which does the launch.
package controller

View File

@@ -0,0 +1,125 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deleter
import (
"time"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
)
type Deleter interface {
Run(updates <-chan queue.Entry, done <-chan struct{})
DeleteOne(pod *queuer.Pod) error
}
type deleter struct {
sched scheduler.Scheduler
qr queuer.Queuer
}
func New(sched scheduler.Scheduler, qr queuer.Queuer) Deleter {
return &deleter{
sched: sched,
qr: qr,
}
}
// currently monitors for "pod deleted" events, upon which DeleteOne()
// is invoked.
func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
go runtime.Until(func() {
for {
entry := <-updates
pod := entry.Value().(*queuer.Pod)
if entry.Is(queue.DELETE_EVENT) {
if err := k.DeleteOne(pod); err != nil {
log.Error(err)
}
} else if !entry.Is(queue.POP_EVENT) {
k.qr.UpdatesAvailable()
}
}
}, 1*time.Second, done)
}
func (k *deleter) DeleteOne(pod *queuer.Pod) error {
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return err
}
log.V(2).Infof("pod deleted: %v", podKey)
// order is important here: we want to make sure we have the lock before
// removing the pod from the scheduling queue. this makes the concurrent
// execution of scheduler-error-handling and delete-handling easier to
// reason about.
k.sched.Lock()
defer k.sched.Unlock()
// prevent the scheduler from attempting to pop this; it's also possible that
// it's concurrently being scheduled (somewhere between pod scheduling and
// binding) - if so, then we'll end up removing it from taskRegistry which
// will abort Bind()ing
k.qr.Dequeue(pod.GetUID())
switch task, state := k.sched.Tasks().ForPod(podKey); state {
case podtask.StateUnknown:
log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
return errors.NoSuchPodErr
// determine if the task has already been launched to mesos; if not, then
// cleanup is easier (unregister) since there's no state to sync
case podtask.StatePending:
if !task.Has(podtask.Launched) {
// we've been invoked in between Schedule() and Bind()
if task.HasAcceptedOffer() {
task.Offer.Release()
task.Reset()
task.Set(podtask.Deleted)
//TODO(jdef) probably want better handling here
if err := k.sched.Tasks().Update(task); err != nil {
return err
}
}
k.sched.Tasks().Unregister(task)
return nil
}
fallthrough
case podtask.StateRunning:
// signal to watchers that the related pod is going down
task.Set(podtask.Deleted)
if err := k.sched.Tasks().Update(task); err != nil {
log.Errorf("failed to update task w/ Deleted status: %v", err)
}
return k.sched.KillTask(task.ID)
default:
log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
return errors.NoSuchTaskErr
}
}
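The state dispatch in DeleteOne uses Go's explicit fallthrough so that a pending task that has already been launched is killed through the same path as a running one. A compact, self-contained sketch of that control flow; the states and actions are stand-ins:

package main

import "fmt"

type state int

const (
	stateUnknown state = iota
	statePending
	stateRunning
)

func deleteTask(s state, launched bool) string {
	switch s {
	case statePending:
		if !launched {
			return "unregister" // nothing running on the cluster yet
		}
		fallthrough // launched: treat like a running task
	case stateRunning:
		return "kill"
	default:
		return "no such task"
	}
}

func main() {
	fmt.Println(deleteTask(statePending, false)) // unregister
	fmt.Println(deleteTask(statePending, true))  // kill
	fmt.Println(deleteTask(stateRunning, true))  // kill
}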

View File

@@ -0,0 +1,160 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deleter
import (
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
types "k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
)
func TestDeleteOne_NonexistentPod(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
assert.Equal(0, len(q.List()))
d := New(obj, qr)
pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
Namespace: api.NamespaceDefault,
}}}
err := d.DeleteOne(pod)
assert.Equal(err, errors.NoSuchPodErr)
obj.AssertExpectations(t)
}
func TestDeleteOne_PendingPod(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)
pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", pod.Pod)
if err != nil {
t.Fatalf("failed to create task: %v", err)
}
_, err = reg.Register(task)
if err != nil {
t.Fatalf("failed to register task: %v", err)
}
// preconditions
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
q.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(q.List()))
_, found := q.Get("default/foo")
assert.True(found)
// exec & post conditions
d := New(obj, qr)
err = d.DeleteOne(pod)
assert.Nil(err)
_, found = q.Get("foo0")
assert.False(found)
assert.Equal(0, len(q.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_Running(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)
pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", pod.Pod)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task, err = reg.Register(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task.Set(podtask.Launched)
err = reg.Update(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// preconditions
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
q.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(q.List()))
_, found := q.Get("default/foo")
assert.True(found)
obj.On("KillTask", task.ID).Return(nil)
// exec & post conditions
d := New(obj, qr)
err = d.DeleteOne(pod)
assert.Nil(err)
_, found = q.Get("foo0")
assert.False(found)
assert.Equal(0, len(q.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_badPodNaming(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
pod := &queuer.Pod{Pod: &api.Pod{}}
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
d := New(obj, qr)
err := d.DeleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = "foo"
err = d.DeleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = ""
pod.Pod.ObjectMeta.Namespace = "bar"
err = d.DeleteOne(pod)
assert.NotNil(err)
obj.AssertExpectations(t)
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package deleter implements the deleter which listens for pod DELETE events
// from the apiserver and kills tasks for deleted pods.
package deleter

View File

@@ -0,0 +1,20 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package components implements independent aspects of the scheduler which
// do not use Framework or Scheduler internals, but rely solely on the Scheduler
// interface.
package components

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package errorhandler implements the ErrorHandler which handles scheduler errors
// and possibly requeues pods for scheduling again.
package errorhandler

View File

@@ -0,0 +1,97 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package errorhandler
import (
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/util"
)
type ErrorHandler interface {
Error(pod *api.Pod, schedulingErr error)
}
type errorHandler struct {
sched scheduler.Scheduler
backoff *backoff.Backoff
qr queuer.Queuer
newBreakChan func(podKey string) queue.BreakChan
}
func New(sched scheduler.Scheduler, backoff *backoff.Backoff, qr queuer.Queuer, newBC func(podKey string) queue.BreakChan) ErrorHandler {
return &errorHandler{
sched: sched,
backoff: backoff,
qr: qr,
newBreakChan: newBC,
}
}
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) Error(pod *api.Pod, schedulingErr error) {
if schedulingErr == errors.NoSuchPodErr {
log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
return
}
log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
defer util.HandleCrash()
// default upstream scheduler passes pod.Name as binding.PodID
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
return
}
k.backoff.GC()
k.sched.Lock()
defer k.sched.Unlock()
switch task, state := k.sched.Tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// if we don't have a mapping here any more then someone deleted the pod
log.V(2).Infof("Could not resolve pod to task, aborting pod reschdule: %s", podKey)
return
case podtask.StatePending:
if task.Has(podtask.Launched) {
log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
return
}
breakoutEarly := queue.BreakChan(nil)
if schedulingErr == errors.NoSuitableOffersErr {
log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
breakoutEarly = k.newBreakChan(podKey)
}
delay := k.backoff.Get(podKey)
log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
k.qr.Requeue(&queuer.Pod{Pod: pod, Delay: &delay, Notify: breakoutEarly})
default:
log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
}
}
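The breakout handling above means a pod waiting out its backoff can be requeued early, e.g. when suitable offers show up before the delay elapses. A simplified sketch of the wait-with-breakout mechanic, with illustrative names rather than the PR's backoff or queue packages:

package main

import (
	"fmt"
	"time"
)

// waitOrBreak sleeps for the backoff delay unless breakout fires first.
func waitOrBreak(delay time.Duration, breakout <-chan struct{}) {
	select {
	case <-time.After(delay):
		fmt.Println("backoff elapsed, requeueing")
	case <-breakout:
		fmt.Println("breakout signal, requeueing early")
	}
}

func main() {
	breakout := make(chan struct{})
	go func() {
		time.Sleep(10 * time.Millisecond)
		close(breakout) // e.g. fresh offers arrived for the pending pod
	}()
	waitOrBreak(time.Second, breakout)
}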

View File

@@ -14,5 +14,5 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
// Package slave manages node hostnames for slave ids.
package slave
// Package framework implements the Mesos scheduler.
package framework

View File

@@ -14,83 +14,13 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package framework
import (
"sync"
"testing"
mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/stretchr/testify/mock"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
)
// implements SchedulerInterface
type MockScheduler struct {
sync.RWMutex
mock.Mock
}
func (m *MockScheduler) slaveHostNameFor(id string) (hostName string) {
args := m.Called(id)
x := args.Get(0)
if x != nil {
hostName = x.(string)
}
return
}
func (m *MockScheduler) algorithm() (f PodScheduler) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(PodScheduler)
}
return
}
func (m *MockScheduler) createPodTask(ctx api.Context, pod *api.Pod) (task *podtask.T, err error) {
args := m.Called(ctx, pod)
x := args.Get(0)
if x != nil {
task = x.(*podtask.T)
}
err = args.Error(1)
return
}
func (m *MockScheduler) offers() (f offers.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(offers.Registry)
}
return
}
func (m *MockScheduler) tasks() (f podtask.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(podtask.Registry)
}
return
}
func (m *MockScheduler) killTask(taskId string) error {
args := m.Called(taskId)
return args.Error(0)
}
func (m *MockScheduler) launchTask(task *podtask.T) error {
args := m.Called(task)
return args.Error(0)
}
// @deprecated this is a placeholder for me to test the mock package
func TestNoSlavesYet(t *testing.T) {
obj := &MockScheduler{}
obj.On("slaveHostNameFor", "foo").Return(nil)
obj.slaveHostNameFor("foo")
obj.AssertExpectations(t)
}
/*-----------------------------------------------------------------------------
|
| this really belongs in the mesos-go package, but that's being updated soon
@@ -146,57 +76,84 @@ func (m *MockSchedulerDriver) Init() error {
args := m.Called()
return args.Error(0)
}
func (m *MockSchedulerDriver) Start() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Stop(b bool) (mesos.Status, error) {
args := m.Called(b)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Abort() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Join() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Run() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) RequestResources(r []*mesos.Request) (mesos.Status, error) {
args := m.Called(r)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) ReconcileTasks(statuses []*mesos.TaskStatus) (mesos.Status, error) {
args := m.Called(statuses)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) LaunchTasks(offerIds []*mesos.OfferID, ti []*mesos.TaskInfo, f *mesos.Filters) (mesos.Status, error) {
args := m.Called(offerIds, ti, f)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) KillTask(tid *mesos.TaskID) (mesos.Status, error) {
args := m.Called(tid)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) DeclineOffer(oid *mesos.OfferID, f *mesos.Filters) (mesos.Status, error) {
args := m.Called(oid, f)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) ReviveOffers() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) SendFrameworkMessage(eid *mesos.ExecutorID, sid *mesos.SlaveID, s string) (mesos.Status, error) {
args := m.Called(eid, sid, s)
return status(args, 0), args.Error(1)
}
func (m *MockSchedulerDriver) Destroy() {
m.Called()
}
func (m *MockSchedulerDriver) Wait() {
m.Called()
}
type JoinableDriver struct {
MockSchedulerDriver
joinFunc func() (mesos.Status, error)
}
// Join invokes joinFunc if it has been set, otherwise blocks forever
func (m *JoinableDriver) Join() (mesos.Status, error) {
if m.joinFunc != nil {
return m.joinFunc()
}
select {}
}
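JoinableDriver shows the usual Go trick for specializing a mock: embed the base type and override a single method, with select {} blocking forever when no joinFunc is supplied. A generic, self-contained sketch of the embedding override, using toy types rather than the test code:

package main

import "fmt"

type base struct{}

func (base) Join() string { return "base Join" }

type joinable struct {
	base // embedded: Join is promoted from here
	joinFunc func() string
}

func (j joinable) Join() string {
	if j.joinFunc != nil {
		return j.joinFunc() // the override takes effect only when set
	}
	return j.base.Join()
}

func main() {
	fmt.Println(joinable{}.Join())                                            // base Join
	fmt.Println(joinable{joinFunc: func() string { return "custom" }}.Join()) // custom
}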

View File

@@ -0,0 +1,716 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"fmt"
"io"
"math"
"net/http"
"sync"
"time"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler"
execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
offermetrics "k8s.io/kubernetes/contrib/mesos/pkg/offers/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/tasksreconciler"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
merrors "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/kubelet/container"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/pkg/util/sets"
)
type Framework interface {
bindings.Scheduler
Init(sched scheduler.Scheduler, electedMaster proc.Process, mux *http.ServeMux) error
Registration() <-chan struct{}
Offers() offers.Registry
LaunchTask(t *podtask.T) error
KillTask(id string) error
}
type framework struct {
// We use a lock here to avoid races
// between invoking the mesos callback
*sync.RWMutex
// Config related, write-once
sched scheduler.Scheduler
schedulerConfig *schedcfg.Config
executor *mesos.ExecutorInfo
executorGroup uint64
client *client.Client
failoverTimeout float64 // in seconds
reconcileInterval int64
nodeRegistrator node.Registrator
storeFrameworkId func(id string)
// Mesos context
driver bindings.SchedulerDriver // late initialization
frameworkId *mesos.FrameworkID
masterInfo *mesos.MasterInfo
registered bool
registration chan struct{} // signal chan that closes upon first successful registration
onRegistration sync.Once
offers offers.Registry
slaveHostNames *slaveRegistry
// via deferred init
tasksReconciler taskreconciler.TasksReconciler
mux *http.ServeMux
reconcileCooldown time.Duration
asRegisteredMaster proc.Doer
terminate <-chan struct{} // signal chan, closes when we should kill background tasks
}
type Config struct {
SchedulerConfig schedcfg.Config
Executor *mesos.ExecutorInfo
Client *client.Client
StoreFrameworkId func(id string)
FailoverTimeout float64
ReconcileInterval int64
ReconcileCooldown time.Duration
LookupNode node.LookupFunc
}
// New creates a new Framework
func New(config Config) Framework {
var k *framework
k = &framework{
schedulerConfig: &config.SchedulerConfig,
RWMutex: new(sync.RWMutex),
executor: config.Executor,
executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
client: config.Client,
failoverTimeout: config.FailoverTimeout,
reconcileInterval: config.ReconcileInterval,
nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode),
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
// the node must be registered and have up-to-date labels
n := config.LookupNode(o.GetHostname())
if n == nil || !node.IsUpToDate(n, node.SlaveAttributesToLabels(o.GetAttributes())) {
return false
}
// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours
for _, eid := range o.GetExecutorIds() {
execuid := uid.Parse(eid.GetValue())
if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
return false
}
}
return true
},
DeclineOffer: func(id string) <-chan error {
errOnce := proc.NewErrorOnce(k.terminate)
errOuter := k.asRegisteredMaster.Do(func() {
var err error
defer errOnce.Report(err)
offerId := mutil.NewOfferID(id)
filters := &mesos.Filters{}
_, err = k.driver.DeclineOffer(offerId, filters)
})
return errOnce.Send(errOuter).Err()
},
// remember expired offers so that we can tell if a previously scheduled task relies on one
LingerTTL: config.SchedulerConfig.OfferLingerTTL.Duration,
TTL: config.SchedulerConfig.OfferTTL.Duration,
ListenerDelay: config.SchedulerConfig.ListenerDelay.Duration,
}),
slaveHostNames: newSlaveRegistry(),
reconcileCooldown: config.ReconcileCooldown,
registration: make(chan struct{}),
asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
return proc.ErrorChanf("cannot execute action with unregistered scheduler")
}),
storeFrameworkId: config.StoreFrameworkId,
}
return k
}
func (k *framework) Init(sched scheduler.Scheduler, electedMaster proc.Process, mux *http.ServeMux) error {
log.V(1).Infoln("initializing kubernetes mesos scheduler")
k.sched = sched
k.mux = mux
k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
if !k.registered {
return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
}
return electedMaster.Do(a)
})
k.terminate = electedMaster.Done()
k.offers.Init(k.terminate)
k.nodeRegistrator.Run(k.terminate)
return k.recoverTasks()
}
func (k *framework) asMaster() proc.Doer {
k.RLock()
defer k.RUnlock()
return k.asRegisteredMaster
}
func (k *framework) installDebugHandlers(mux *http.ServeMux) {
wrappedHandler := func(uri string, h http.Handler) {
mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
ch := make(chan struct{})
closer := runtime.Closer(ch)
proc.OnError(k.asMaster().Do(func() {
defer closer()
h.ServeHTTP(w, r)
}), func(err error) {
defer closer()
log.Warningf("failed HTTP request for %s: %v", uri, err)
w.WriteHeader(http.StatusServiceUnavailable)
}, k.terminate)
select {
case <-time.After(k.schedulerConfig.HttpHandlerTimeout.Duration):
log.Warningf("timed out waiting for request to be processed")
w.WriteHeader(http.StatusServiceUnavailable)
return
case <-ch: // noop
}
})
}
requestReconciliation := func(uri string, requestAction func()) {
wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestAction()
w.WriteHeader(http.StatusNoContent)
}))
}
requestReconciliation("/debug/actions/requestExplicit", k.tasksReconciler.RequestExplicit)
requestReconciliation("/debug/actions/requestImplicit", k.tasksReconciler.RequestImplicit)
wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
slaves := k.slaveHostNames.SlaveIDs()
for _, slaveId := range slaves {
_, err := k.driver.SendFrameworkMessage(
k.executor.ExecutorId,
mutil.NewSlaveID(slaveId),
messages.Kamikaze)
if err != nil {
log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
} else {
io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
}
}
io.WriteString(w, "OK")
}))
}
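// The debug endpoints installed above can be exercised manually, e.g.
// (host and port are illustrative):
//
//   curl http://localhost:10251/debug/actions/requestExplicit  # 204 No Content
//   curl http://localhost:10251/debug/actions/requestImplicit  # 204 No Content
//   curl http://localhost:10251/debug/actions/kamikaze         # asks executors to self-destruct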
func (k *framework) Registration() <-chan struct{} {
return k.registration
}
// Registered is called when the scheduler is successfully registered with the master.
func (k *framework) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)
k.driver = drv
k.frameworkId = fid
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.tasksReconciler.RequestExplicit()
}
// Reregistered is called when the scheduler is successfully re-registered with the master.
// This happens when the master fails over.
func (k *framework) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
log.Infof("Scheduler reregistered with the master: %v\n", mi)
k.driver = drv
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.tasksReconciler.RequestExplicit()
}
// perform one-time initialization actions upon the first registration event received from Mesos.
func (k *framework) onInitialRegistration(driver bindings.SchedulerDriver) {
defer close(k.registration)
if k.failoverTimeout > 0 {
refreshInterval := k.schedulerConfig.FrameworkIdRefreshInterval.Duration
if k.failoverTimeout < k.schedulerConfig.FrameworkIdRefreshInterval.Duration.Seconds() {
refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
}
go runtime.Until(func() {
k.storeFrameworkId(k.frameworkId.GetValue())
}, refreshInterval, k.terminate)
}
r1 := k.makeTaskRegistryReconciler()
r2 := k.makePodRegistryReconciler()
k.tasksReconciler = taskreconciler.New(k.asRegisteredMaster, taskreconciler.MakeComposite(k.terminate, r1, r2),
k.reconcileCooldown, k.schedulerConfig.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
go k.tasksReconciler.Run(driver, k.terminate)
if k.reconcileInterval > 0 {
ri := time.Duration(k.reconcileInterval) * time.Second
time.AfterFunc(k.schedulerConfig.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.tasksReconciler.RequestImplicit, ri, k.terminate) })
log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedulerConfig.InitialImplicitReconciliationDelay.Duration)
}
k.installDebugHandlers(k.mux)
}
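// Worked example for the refresh interval above (numbers illustrative): with
// FrameworkIdRefreshInterval = 30s and failoverTimeout = 10s the framework ID
// is re-stored every max(1, 10/2) = 5s; with failoverTimeout = 1.5s the
// interval is clamped to the 1s floor.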
// Disconnected is called when the scheduler loses connection to the master.
func (k *framework) Disconnected(driver bindings.SchedulerDriver) {
log.Infof("Master disconnected!\n")
k.registered = false
// discard all cached offers to avoid unnecessary TASK_LOST updates
k.offers.Invalidate("")
}
// ResourceOffers is called when the scheduler receives some offers from the master.
func (k *framework) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
log.V(2).Infof("Received offers %+v", offers)
// Record the offers in the global offer map as well as each slave's offer map.
k.offers.Add(offers)
for _, offer := range offers {
slaveId := offer.GetSlaveId().GetValue()
k.slaveHostNames.Register(slaveId, offer.GetHostname())
// create the api object if it does not already exist
if k.nodeRegistrator != nil {
labels := node.SlaveAttributesToLabels(offer.GetAttributes())
_, err := k.nodeRegistrator.Register(offer.GetHostname(), labels)
if err != nil {
log.Error(err)
}
}
}
}
// OfferRescinded is called when resources are rescinded from the scheduler.
func (k *framework) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
log.Infof("Offer rescinded %v\n", offerId)
oid := offerId.GetValue()
k.offers.Delete(oid, offermetrics.OfferRescinded)
}
// StatusUpdate is called when a status update message is sent to the scheduler.
func (k *framework) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
source, reason := "none", "none"
if taskStatus.Source != nil {
source = (*taskStatus.Source).String()
}
if taskStatus.Reason != nil {
reason = (*taskStatus.Reason).String()
}
taskState := taskStatus.GetState()
metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()
message := "none"
if taskStatus.Message != nil {
message = *taskStatus.Message
}
log.Infof(
"task status update %q from %q for task %q on slave %q executor %q for reason %q with message %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason,
message,
)
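// Dispatch summary (sketch) for the switch below:
//   STAGING/STARTING/RUNNING/FINISHED -> update registry; unknown non-FINISHED
//                                        tasks go through reconcileNonTerminalTask
//   FAILED/ERROR                      -> launched-but-unbound tasks are handed to
//                                        sched.Reconcile, otherwise fall through
//   LOST/KILLED (and fallthrough)     -> reconcileTerminalTask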
switch taskState {
case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
if _, state := k.sched.Tasks().UpdateStatus(taskStatus); state == podtask.StateUnknown {
if taskState != mesos.TaskState_TASK_FINISHED {
//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
//I don't want to reincarnate then.. TASK_LOST is a special case because
//the master is stateless and there are scenarios where I may get TASK_LOST
//followed by TASK_RUNNING.
//TODO(jdef) consider running this asynchronously since there are API server
//calls that may be made
k.reconcileNonTerminalTask(driver, taskStatus)
} // else, we don't really care about FINISHED tasks that aren't registered
return
}
if hostName := k.slaveHostNames.HostName(taskStatus.GetSlaveId().GetValue()); hostName == "" {
// a registered task has an update reported by a slave that we don't recognize;
// this should never happen, so we don't reconcile it.
log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
return
}
case mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
if task, _ := k.sched.Tasks().UpdateStatus(taskStatus); task != nil {
if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
go k.sched.Reconcile(task)
return
}
} else {
// unknown task failed, not much we can do about it
return
}
// last-ditch effort to reconcile our records
fallthrough
case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
k.reconcileTerminalTask(driver, taskStatus)
default:
log.Errorf(
"unknown task status %q from %q for task %q on slave %q executor %q for reason %q with message %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason,
message,
)
}
}
func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
task, state := k.sched.Tasks().UpdateStatus(taskStatus)
if (state == podtask.StateRunning || state == podtask.StatePending) &&
((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared)) {
//--
// pod-task has metadata that refers to:
// (1) a task that Mesos no longer knows about, or
// (2) a pod that the Kubelet will never report as "failed", or
// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart).
// For now, destroy the pod and hope that there's a replication controller backing it up.
// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
pod := &task.Pod
log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
if err := k.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
}
} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
// attempt to prevent dangling pods in the pod and task registries
log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
k.tasksReconciler.RequestExplicit()
} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
//If we're reconciling and receive this then the executor may be
//running a task that we need it to kill. It's possible that the framework
//is unrecognized by the master at this point, so KillTask is not guaranteed
//to do anything. The underlying driver transport may be able to send a
//FrameworkMessage directly to the slave to terminate the task.
log.V(2).Info("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
log.Error(err.Error())
}
}
}
// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *framework) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
// attempt to recover task from pod info:
// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
// - pull the pod metadata down from the api server
// - perform task recovery based on pod metadata
taskId := taskStatus.TaskId.GetValue()
if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
// there will be no data in the task status that we can use to determine the associated pod
switch taskStatus.GetState() {
case mesos.TaskState_TASK_STAGING:
// there is still hope for this task, don't kill it just yet
//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
return
default:
// for TASK_{STARTING,RUNNING} we should already have attempted recovery via recoverTasks().
// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
// be processing this reconciliation update before we process the one from the executor.
// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
// so it gets killed.
log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
}
} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
podStatus.Name, taskId, err)
} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
if t, ok, err := podtask.RecoverFrom(*pod); ok {
log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
_, err := k.sched.Tasks().Register(t)
if err != nil {
// someone beat us to it?!
log.Warningf("failed to register recovered task: %v", err)
return
}
k.sched.Tasks().UpdateStatus(taskStatus)
return
} else if err != nil {
//should kill the pod and the task
log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
}
} else {
//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
//metadata is not appropriate for task reconstruction -- which should almost certainly never
//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
//we were failed over.
//kill this task, allow the newly launched scheduler to schedule the new pod
log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
}
} else if errors.IsNotFound(err) {
// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
} else if errors.IsServerTimeout(err) {
log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
return
} else {
log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
return
}
if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
log.Errorf("failed to kill task %v: %v", taskId, err)
}
}
// FrameworkMessage is called when the scheduler receives a message from the executor.
func (k *framework) FrameworkMessage(driver bindings.SchedulerDriver,
executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) {
log.Infof("Received messages from executor %v of slave %v, %v\n", executorId, slaveId, message)
}
// SlaveLost is called when some slave is lost.
func (k *framework) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) {
log.Infof("Slave %v is lost\n", slaveId)
sid := slaveId.GetValue()
k.offers.InvalidateForSlave(sid)
// TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile
// tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically
// flush lost slaves older than X, and for which no tasks or pods reference.
// unfinished tasks/pods will be dropped. use a replication controller if you want pods to
// be restarted when slaves die.
}
// ExecutorLost is called when some executor is lost.
func (k *framework) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) {
log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status)
// TODO(yifan): Restart any unfinished tasks of the executor.
}
// Error is called when there is an unrecoverable error in the scheduler or scheduler driver.
// The driver should have been aborted before this is invoked.
func (k *framework) Error(driver bindings.SchedulerDriver, message string) {
log.Fatalf("fatal scheduler error: %v\n", message)
}
// filter func used for explicit task reconciliation, selects only non-terminal tasks which
// have been communicated to mesos (read: launched).
func explicitTaskFilter(t *podtask.T) bool {
switch t.State {
case podtask.StateRunning:
return true
case podtask.StatePending:
return t.Has(podtask.Launched)
default:
return false
}
}
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks listed in the scheduler's internal taskRegistry.
func (k *framework) makeTaskRegistryReconciler() taskreconciler.Action {
return taskreconciler.Action(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
taskToSlave := make(map[string]string)
for _, t := range k.sched.Tasks().List(explicitTaskFilter) {
if t.Spec.SlaveID != "" {
taskToSlave[t.ID] = t.Spec.SlaveID
}
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks identified by annotations in the Kubernetes pod registry.
func (k *framework) makePodRegistryReconciler() taskreconciler.Action {
return taskreconciler.Action(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
podList, err := k.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
if err != nil {
return proc.ErrorChanf("failed to reconcile pod registry: %v", err)
}
taskToSlave := make(map[string]string)
for _, pod := range podList.Items {
if len(pod.Annotations) == 0 {
continue
}
taskId, found := pod.Annotations[meta.TaskIdKey]
if !found {
continue
}
slaveId, found := pod.Annotations[meta.SlaveIdKey]
if !found {
continue
}
taskToSlave[taskId] = slaveId
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
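// For reference, the pod-registry reconciler above only considers pods that
// carry both annotations (example values are hypothetical):
//
//   pod.Annotations[meta.TaskIdKey]  == "pod.0a1b2c"         // mesos task ID
//   pod.Annotations[meta.SlaveIdKey] == "20151112-143025-S3" // mesos slave ID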
// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
func (k *framework) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
log.Info("explicit reconcile tasks")
// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
statusList := []*mesos.TaskStatus{}
remaining := sets.StringKeySet(taskToSlave)
for taskId, slaveId := range taskToSlave {
if slaveId == "" {
delete(taskToSlave, taskId)
continue
}
statusList = append(statusList, &mesos.TaskStatus{
TaskId: mutil.NewTaskID(taskId),
SlaveId: mutil.NewSlaveID(slaveId),
State: mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
})
}
select {
case <-cancel:
return merrors.ReconciliationCancelledErr
default:
if _, err := driver.ReconcileTasks(statusList); err != nil {
return err
}
}
start := time.Now()
first := true
for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
first = false
// nothing to do here other than wait for status updates..
if backoff > k.schedulerConfig.ExplicitReconciliationMaxBackoff.Duration {
backoff = k.schedulerConfig.ExplicitReconciliationMaxBackoff.Duration
}
select {
case <-cancel:
return merrors.ReconciliationCancelledErr
case <-time.After(backoff):
for taskId := range remaining {
if task, _ := k.sched.Tasks().Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
// keep this task in remaining list
continue
}
remaining.Delete(taskId)
}
}
}
return nil
}
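// Backoff behaviour above, by example: status updates are polled after 1s,
// 2s, 4s, 8s, ... capped at ExplicitReconciliationMaxBackoff (with a 32s cap:
// 1, 2, 4, 8, 16, 32, 32, ... seconds); the loop ends once every task in
// remaining has an update newer than start, or the operation is cancelled.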
func (ks *framework) recoverTasks() error {
podList, err := ks.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
if err != nil {
log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err)
return err
}
recoverSlave := func(t *podtask.T) {
slaveId := t.Spec.SlaveID
ks.slaveHostNames.Register(slaveId, t.Offer.Host())
}
for _, pod := range podList.Items {
if _, isMirrorPod := pod.Annotations[kubetypes.ConfigMirrorAnnotationKey]; isMirrorPod {
// mirrored pods are never reconciled because the scheduler isn't responsible for
// scheduling them; they're started by the executor/kubelet upon instantiation and
// reflected in the apiserver afterward. the scheduler has no knowledge of them.
continue
}
if t, ok, err := podtask.RecoverFrom(pod); err != nil {
log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err)
err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil)
//TODO(jdef) check for temporary or not-found errors
if err != nil {
log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err)
}
} else if ok {
ks.sched.Tasks().Register(t)
recoverSlave(t)
log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name)
}
}
return nil
}
func (ks *framework) KillTask(id string) error {
killTaskId := mutil.NewTaskID(id)
_, err := ks.driver.KillTask(killTaskId)
return err
}
func (ks *framework) LaunchTask(t *podtask.T) error {
// assume caller is holding scheduler lock
taskList := []*mesos.TaskInfo{t.BuildTaskInfo(ks.executor)}
offerIds := []*mesos.OfferID{t.Offer.Details().Id}
filters := &mesos.Filters{}
_, err := ks.driver.LaunchTasks(offerIds, taskList, filters)
return err
}
func (ks *framework) Offers() offers.Registry {
return ks.offers
}


@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package framework
import (
"reflect"
@@ -25,9 +25,9 @@ import (
"github.com/stretchr/testify/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/slave"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
)
@@ -81,12 +81,19 @@ func (r *mockRegistrator) Register(hostName string, labels map[string]string) (b
}
}
func mockScheduler() scheduler.Scheduler {
mockScheduler := &scheduler.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
mockScheduler.On("Tasks").Return(reg)
return mockScheduler
}
//test adding of a resource offer; it should be added to the offer registry and slaves
func TestResourceOffer_Add(t *testing.T) {
assert := assert.New(t)
registrator := &mockRegistrator{cache.NewStore(cache.MetaNamespaceKeyFunc)}
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -99,39 +106,40 @@ func TestResourceOffer_Add(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
nodeRegistrator: registrator,
sched: mockScheduler(),
}
hostname := "h1"
offerID1 := util.NewOfferID("test1")
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
testFramework.ResourceOffers(nil, offers1)
assert.Equal(1, len(registrator.store.List()))
assert.Equal(1, getNumberOffers(testScheduler.offers))
assert.Equal(1, getNumberOffers(testFramework.offers))
//check slave hostname
assert.Equal(1, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(1, len(testFramework.slaveHostNames.SlaveIDs()))
//add another offer
hostname2 := "h2"
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
testFramework.ResourceOffers(nil, offers2)
//check it is stored in registry
assert.Equal(2, getNumberOffers(testScheduler.offers))
assert.Equal(2, getNumberOffers(testFramework.offers))
//check slave hostnames
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
}
//test adding and rescinding of resource offers against the offer registry and slaves
func TestResourceOffer_Add_Rescind(t *testing.T) {
assert := assert.New(t)
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -144,42 +152,43 @@ func TestResourceOffer_Add_Rescind(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
sched: mockScheduler(),
}
hostname := "h1"
offerID1 := util.NewOfferID("test1")
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
testFramework.ResourceOffers(nil, offers1)
assert.Equal(1, getNumberOffers(testScheduler.offers))
assert.Equal(1, getNumberOffers(testFramework.offers))
//check slave hostname
assert.Equal(1, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(1, len(testFramework.slaveHostNames.SlaveIDs()))
//add another offer
hostname2 := "h2"
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
testFramework.ResourceOffers(nil, offers2)
assert.Equal(2, getNumberOffers(testScheduler.offers))
assert.Equal(2, getNumberOffers(testFramework.offers))
//check slave hostnames
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
//next, check whether offers can be rescinded
testScheduler.OfferRescinded(nil, offerID1)
assert.Equal(1, getNumberOffers(testScheduler.offers))
testFramework.OfferRescinded(nil, offerID1)
assert.Equal(1, getNumberOffers(testFramework.offers))
//next, check whether the remaining offer can be rescinded
testScheduler.OfferRescinded(nil, util.NewOfferID("test2"))
testFramework.OfferRescinded(nil, util.NewOfferID("test2"))
//walk offers again and check it is removed from registry
assert.Equal(0, getNumberOffers(testScheduler.offers))
assert.Equal(0, getNumberOffers(testFramework.offers))
//remove a non-existent ID
testScheduler.OfferRescinded(nil, util.NewOfferID("notExist"))
testFramework.OfferRescinded(nil, util.NewOfferID("notExist"))
}
//test that when a slave is lost we remove all offers
@@ -187,7 +196,7 @@ func TestSlave_Lost(t *testing.T) {
assert := assert.New(t)
//
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -197,45 +206,46 @@ func TestSlave_Lost(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
sched: mockScheduler(),
}
hostname := "h1"
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
testFramework.ResourceOffers(nil, offers1)
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
testFramework.ResourceOffers(nil, offers2)
//add another offer from different slaveID
hostname2 := "h2"
offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers3 := []*mesos.Offer{offer3}
testScheduler.ResourceOffers(nil, offers3)
testFramework.ResourceOffers(nil, offers3)
//test precondition
assert.Equal(3, getNumberOffers(testScheduler.offers))
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(3, getNumberOffers(testFramework.offers))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
//remove first slave
testScheduler.SlaveLost(nil, util.NewSlaveID(hostname))
testFramework.SlaveLost(nil, util.NewSlaveID(hostname))
//offers should be removed
assert.Equal(1, getNumberOffers(testScheduler.offers))
assert.Equal(1, getNumberOffers(testFramework.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
//remove second slave
testScheduler.SlaveLost(nil, util.NewSlaveID(hostname2))
testFramework.SlaveLost(nil, util.NewSlaveID(hostname2))
//offers should be removed
assert.Equal(0, getNumberOffers(testScheduler.offers))
assert.Equal(0, getNumberOffers(testFramework.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
//try to remove a non-existent slave
testScheduler.SlaveLost(nil, util.NewSlaveID("notExist"))
testFramework.SlaveLost(nil, util.NewSlaveID("notExist"))
}
@@ -244,7 +254,7 @@ func TestDisconnect(t *testing.T) {
assert := assert.New(t)
//
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -254,30 +264,31 @@ func TestDisconnect(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
sched: mockScheduler(),
}
hostname := "h1"
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers1 := []*mesos.Offer{offer1}
testScheduler.ResourceOffers(nil, offers1)
testFramework.ResourceOffers(nil, offers1)
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
offers2 := []*mesos.Offer{offer2}
testScheduler.ResourceOffers(nil, offers2)
testFramework.ResourceOffers(nil, offers2)
//add another offer from different slaveID
hostname2 := "h2"
offer3 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
offers3 := []*mesos.Offer{offer3}
testScheduler.ResourceOffers(nil, offers3)
testFramework.ResourceOffers(nil, offers3)
//disconnect
testScheduler.Disconnected(nil)
testFramework.Disconnected(nil)
//all offers should be removed
assert.Equal(0, getNumberOffers(testScheduler.offers))
assert.Equal(0, getNumberOffers(testFramework.offers))
//slave hostnames should still be all present
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
}
//test we can handle different status updates, TODO check state transitions
@@ -287,7 +298,7 @@ func TestStatus_Update(t *testing.T) {
// setup expectations
mockdriver.On("KillTask", util.NewTaskID("test-task-001")).Return(mesos.Status_DRIVER_RUNNING, nil)
testScheduler := &KubernetesScheduler{
testFramework := &framework{
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
return true
@@ -297,28 +308,28 @@ func TestStatus_Update(t *testing.T) {
TTL: schedcfg.DefaultOfferTTL,
ListenerDelay: schedcfg.DefaultListenerDelay,
}),
slaveHostNames: slave.NewRegistry(),
slaveHostNames: newSlaveRegistry(),
driver: &mockdriver,
taskRegistry: podtask.NewInMemoryRegistry(),
sched: mockScheduler(),
}
taskStatus_task_starting := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_RUNNING,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_starting)
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_starting)
taskStatus_task_running := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_RUNNING,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_running)
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_running)
taskStatus_task_failed := util.NewTaskStatus(
util.NewTaskID("test-task-001"),
mesos.TaskState_TASK_FAILED,
)
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_failed)
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_failed)
//assert that mock was invoked
mockdriver.AssertExpectations(t)


@@ -14,25 +14,26 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package slave
package framework
import (
"sync"
)
type Registry struct {
// slaveRegistry manages node hostnames for slave ids.
type slaveRegistry struct {
lock sync.Mutex
hostNames map[string]string
}
func NewRegistry() *Registry {
return &Registry{
func newSlaveRegistry() *slaveRegistry {
return &slaveRegistry{
hostNames: map[string]string{},
}
}
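// Hypothetical usage sketch:
//
//   reg := newSlaveRegistry()
//   reg.Register("20151112-143025-S1", "node-1.example.com")
//   reg.HostName("20151112-143025-S1") // "node-1.example.com"
//   reg.SlaveIDs()                     // ["20151112-143025-S1"]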
// Register creates a mapping between a slaveId and slave if not existing.
func (st *Registry) Register(slaveId, slaveHostname string) {
func (st *slaveRegistry) Register(slaveId, slaveHostname string) {
st.lock.Lock()
defer st.lock.Unlock()
_, exists := st.hostNames[slaveId]
@@ -42,7 +43,7 @@ func (st *Registry) Register(slaveId, slaveHostname string) {
}
// SlaveIDs returns the keys of the registry
func (st *Registry) SlaveIDs() []string {
func (st *slaveRegistry) SlaveIDs() []string {
st.lock.Lock()
defer st.lock.Unlock()
slaveIds := make([]string, 0, len(st.hostNames))
@@ -53,7 +54,7 @@ func (st *Registry) SlaveIDs() []string {
}
// HostName looks up a hostname for a given slaveId
func (st *Registry) HostName(slaveId string) string {
func (st *slaveRegistry) HostName(slaveId string) string {
st.lock.Lock()
defer st.lock.Unlock()
return st.hostNames[slaveId]


@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package slave
package framework
import (
"testing"
@@ -26,7 +26,7 @@ import (
func TestSlaveStorage_Register(t *testing.T) {
assert := assert.New(t)
slaveStorage := NewRegistry()
slaveStorage := newSlaveRegistry()
assert.Equal(0, len(slaveStorage.hostNames))
slaveId := "slave1"
@@ -42,7 +42,7 @@ func TestSlaveStorage_Register(t *testing.T) {
func TestSlaveStorage_HostName(t *testing.T) {
assert := assert.New(t)
slaveStorage := NewRegistry()
slaveStorage := newSlaveRegistry()
assert.Equal(0, len(slaveStorage.hostNames))
slaveId := "slave1"
@@ -62,7 +62,7 @@ func TestSlaveStorage_HostName(t *testing.T) {
func TestSlaveStorage_SlaveIds(t *testing.T) {
assert := assert.New(t)
slaveStorage := NewRegistry()
slaveStorage := newSlaveRegistry()
assert.Equal(0, len(slaveStorage.hostNames))
slaveId := "1"


@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package podreconciler implements pod reconciliation for pods which failed
// to launch, i.e. before binding by the executor took place.
package podreconciler


@@ -0,0 +1,120 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podreconciler
import (
"time"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/deleter"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
apierrors "k8s.io/kubernetes/pkg/api/errors"
client "k8s.io/kubernetes/pkg/client/unversioned"
)
// PodReconciler reconciles a pod with the apiserver
type PodReconciler interface {
Reconcile(t *podtask.T)
}
type podReconciler struct {
sched scheduler.Scheduler
client *client.Client
qr queuer.Queuer
deleter deleter.Deleter
}
func New(sched scheduler.Scheduler, client *client.Client, qr queuer.Queuer, deleter deleter.Deleter) PodReconciler {
return &podReconciler{
sched: sched,
client: client,
qr: qr,
deleter: deleter,
}
}
// this pod may be out of sync with respect to the API server registry:
// this pod | apiserver registry
// -------------|----------------------
// host=.* | 404 ; pod was deleted
// host=.* | 5xx ; failed to sync, try again later?
// host="" | host="" ; perhaps no updates to process?
// host="" | host="..." ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
// host="..." | host="" ; pod is no longer scheduled, does it need to be re-queued?
// host="..." | host="..." ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *podReconciler) Reconcile(t *podtask.T) {
log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
if err != nil {
if apierrors.IsNotFound(err) {
// attempt to delete
if err = s.deleter.DeleteOne(&queuer.Pod{Pod: &t.Pod}); err != nil && err != errors.NoSuchPodErr && err != errors.NoSuchTaskErr {
log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
}
} else {
//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
//For now, drop the pod on the floor
log.Warning("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
}
return
}
log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
if t.Spec.AssignedSlave != pod.Spec.NodeName {
if pod.Spec.NodeName == "" {
// pod is unscheduled.
// it's possible that we dropped the pod in the scheduler error handler
// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Error(err)
return
}
s.sched.Lock()
defer s.sched.Unlock()
if _, state := s.sched.Tasks().ForPod(podKey); state != podtask.StateUnknown {
//TODO(jdef) reconcile the task
log.Errorf("task already registered for pod %v", pod.Name)
return
}
now := time.Now()
log.V(3).Infof("reoffering pod %v", podKey)
s.qr.Reoffer(queuer.NewPodWithDeadline(pod, &now))
} else {
// pod is scheduled.
// not sure how this happened behind our backs. attempt to reconstruct
// at least a partial podtask.T record.
//TODO(jdef) reconcile the task
log.Errorf("pod already scheduled: %v", pod.Name)
}
} else {
//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
//and assume that our knowledge of the pod aligns with that of the apiserver
log.Error("pod reconciliation does not support updates; not yet implemented")
}
}


@@ -0,0 +1,63 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package components
import (
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
)
// Consumes *api.Pod, produces *queuer.Pod; the k8s reflector wants to push
// *api.Pod objects at us, but we want to store the more flexible queuer.Pod
// type instead. The adapter implementation facilitates this. It's a little
// hackish since the object type going in is different than the object type
// coming out -- you've been warned.
type podStoreAdapter struct {
queue.FIFO
}
func (psa *podStoreAdapter) Add(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Add(&queuer.Pod{Pod: pod})
}
func (psa *podStoreAdapter) Update(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Update(&queuer.Pod{Pod: pod})
}
func (psa *podStoreAdapter) Delete(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Delete(&queuer.Pod{Pod: pod})
}
func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
pod := obj.(*api.Pod)
return psa.FIFO.Get(&queuer.Pod{Pod: pod})
}
// Replace will delete the contents of the store, using instead the
// given list. This store implementation does NOT take ownership of the list.
func (psa *podStoreAdapter) Replace(objs []interface{}, resourceVersion string) error {
newobjs := make([]interface{}, len(objs))
for i, v := range objs {
pod := v.(*api.Pod)
newobjs[i] = &queuer.Pod{Pod: pod}
}
return psa.FIFO.Replace(newobjs, resourceVersion)
}
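// Assumed wiring sketch, mirroring how New() below plugs the adapter between
// the reflector and the queue (lw and backlog come from the caller):
//
//   bypass := make(chan queue.Entry, backlog)
//   store := &podStoreAdapter{queue.NewHistorical(bypass)}
//   cache.NewReflector(lw, &api.Pod{}, store, 0) // pushes *api.Pod, stored as *queuer.Pod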


@@ -0,0 +1,137 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package components
import (
"net/http"
"sync"
mesos "github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/binder"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/controller"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/deleter"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/errorhandler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/podreconciler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
)
// sched implements the Scheduler interface.
type sched struct {
podReconciler podreconciler.PodReconciler
framework framework.Framework
controller controller.Controller
// unsafe state, needs to be guarded, especially changes to podtask.T objects
sync.RWMutex
taskRegistry podtask.Registry
}
func New(c *config.Config, fw framework.Framework, ps podschedulers.PodScheduler,
client *client.Client, recorder record.EventRecorder, terminate <-chan struct{}, mux *http.ServeMux, lw *cache.ListWatch) scheduler.Scheduler {
core := &sched{
framework: fw,
taskRegistry: podtask.NewInMemoryRegistry(),
}
// Watch and queue pods that need scheduling.
podUpdatesBypass := make(chan queue.Entry, c.UpdatesBacklog)
podUpdates := &podStoreAdapter{queue.NewHistorical(podUpdatesBypass)}
reflector := cache.NewReflector(lw, &api.Pod{}, podUpdates, 0)
q := queuer.New(queue.NewDelayFIFO(), podUpdates)
algorithm := algorithm.New(core, podUpdates, ps)
podDeleter := deleter.New(core, q)
core.podReconciler = podreconciler.New(core, client, q, podDeleter)
bo := backoff.New(c.InitialPodBackoff.Duration, c.MaxPodBackoff.Duration)
newBC := func(podKey string) queue.BreakChan {
return queue.BreakChan(core.Offers().Listen(podKey, func(offer *mesos.Offer) bool {
core.Lock()
defer core.Unlock()
switch task, state := core.Tasks().ForPod(podKey); state {
case podtask.StatePending:
// Assess fitness of pod with the current offer. The scheduler normally
// "backs off" when it can't find an offer that matches up with a pod.
// The backoff period for a pod can terminate sooner if an offer becomes
// available that matches up.
return !task.Has(podtask.Launched) && ps.FitPredicate()(task, offer, nil)
default:
// no point in continuing to check for matching offers
return true
}
}))
}
errorHandler := errorhandler.New(core, bo, q, newBC)
binder := binder.New(core)
startLatch := make(chan struct{})
runtime.On(startLatch, func() {
reflector.Run() // TODO(jdef) should listen for termination
podDeleter.Run(podUpdatesBypass, terminate)
q.Run(terminate)
q.InstallDebugHandlers(mux)
podtask.InstallDebugHandlers(core.Tasks(), mux)
})
core.controller = controller.New(client, algorithm, recorder, q.Yield, errorHandler.Error, binder, startLatch)
return core
}
func (c *sched) Run(done <-chan struct{}) {
c.controller.Run(done)
}
func (c *sched) Reconcile(t *podtask.T) {
c.podReconciler.Reconcile(t)
}
func (c *sched) Tasks() podtask.Registry {
return c.taskRegistry
}
func (c *sched) Offers() offers.Registry {
return c.framework.Offers()
}
func (c *sched) KillTask(id string) error {
return c.framework.KillTask(id)
}
func (c *sched) LaunchTask(t *podtask.T) error {
return c.framework.LaunchTask(t)
}


@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package taskreconciler implements Mesos task reconciliation.
package taskreconciler


@@ -0,0 +1,235 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package taskreconciler
import (
"fmt"
"time"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
bindings "github.com/mesos/mesos-go/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
)
type Action func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error
type TasksReconciler interface {
RequestExplicit()
RequestImplicit()
Run(driver bindings.SchedulerDriver, done <-chan struct{})
}
type tasksReconciler struct {
proc.Doer
Action Action
explicit chan struct{} // send an empty struct to trigger explicit reconciliation
implicit chan struct{} // send an empty struct to trigger implicit reconciliation
cooldown time.Duration
explicitReconciliationAbortTimeout time.Duration
}
func New(doer proc.Doer, action Action,
cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) TasksReconciler {
return &tasksReconciler{
Doer: doer,
explicit: make(chan struct{}, 1),
implicit: make(chan struct{}, 1),
cooldown: cooldown,
explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
// trigger the reconciler action in the doer's execution context,
// but it could take a while and the scheduler needs to be able to
// process updates, the callbacks for which ALSO execute in the SAME
// deferred execution context -- so the action MUST be executed async.
errOnce := proc.NewErrorOnce(cancel)
return errOnce.Send(doer.Do(func() {
// only triggers the action if we're the currently elected,
// registered master and runs the action async.
go func() {
var err <-chan error
defer errOnce.Send(err)
err = action(driver, cancel)
}()
})).Err()
},
}
}
func (r *tasksReconciler) RequestExplicit() {
select {
case r.explicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
func (r *tasksReconciler) RequestImplicit() {
select {
case r.implicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
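// The buffered channels (capacity 1) coalesce bursts of requests: while a
// request is pending, further ones are dropped, e.g.
//
//   r.RequestExplicit()
//   r.RequestExplicit() // no-op, an explicit run is already queued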
// execute task reconciliation, returns when done is closed. intended to run as a goroutine.
// if reconciliation is requested while another is in progress, the in-progress operation will be
// cancelled before the new reconciliation operation begins.
func (r *tasksReconciler) Run(driver bindings.SchedulerDriver, done <-chan struct{}) {
var cancel, finished chan struct{}
requestLoop:
for {
select {
case <-done:
return
default: // proceed
}
select {
case <-r.implicit:
metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
select {
case <-done:
return
case <-r.explicit:
break // give preference to a pending request for explicit
default: // continue
// don't run implicit reconciliation while explicit is ongoing
if finished != nil {
select {
case <-finished: // continue w/ implicit
default:
log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
continue requestLoop
}
}
errOnce := proc.NewErrorOnce(done)
errCh := r.Do(func() {
var err error
defer errOnce.Report(err)
log.Infoln("implicit reconcile tasks")
metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
}
})
proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
log.Errorf("failed to run implicit reconciliation: %v", err)
}, done)
goto slowdown
}
case <-done:
return
case <-r.explicit: // continue
metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
}
if cancel != nil {
close(cancel)
cancel = nil
// play nice and wait for the prior operation to finish, complain
// if it doesn't
select {
case <-done:
return
case <-finished: // noop, expected
case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
log.Error("reconciler action failed to stop upon cancellation")
}
}
// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
// if cancellation takes too long or fails - we don't want to close the same chan
// more than once
cancel = make(chan struct{})
finished = make(chan struct{})
go func(fin chan struct{}) {
startedAt := time.Now()
defer func() {
metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
}()
metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
defer close(fin)
err := <-r.Action(driver, cancel)
if err == errors.ReconciliationCancelledErr {
metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
log.Infoln(err.Error())
} else if err != nil {
log.Errorf("reconciler action failed: %v", err)
}
}(finished)
slowdown:
// don't allow reconciliation to run very frequently, either explicit or implicit
select {
case <-done:
return
case <-time.After(r.cooldown): // noop
}
} // for
}
// MakeComposite invokes the given Action funcs in sequence, aborting the sequence if reconciliation
// is cancelled. If any other error occurs, the composite reconciler will attempt to complete the
// sequence, reporting only the last generated error.
func MakeComposite(done <-chan struct{}, actions ...Action) Action {
if x := len(actions); x == 0 {
// programming error
panic("no actions specified for composite reconciler")
} else if x == 1 {
return actions[0]
}
chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b Action) <-chan error {
ech := a(d, c)
ch := make(chan error, 1)
go func() {
select {
case <-done:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
ech = b(d, c)
select {
case <-done:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
close(ch)
return
}
}
ch <- fmt.Errorf("aborting composite reconciler action")
}()
return ch
}
result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, actions[0], actions[1])
}
for i := 2; i < len(actions); i++ {
i := i
next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, Action(result), actions[i])
}
result = next
}
return Action(result)
}
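// Hypothetical composition sketch: reconcile the task registry first, then
// the pod registry, surfacing the last error (if any):
//
//   composite := MakeComposite(done, taskRegistryAction, podRegistryAction)
//   if err := <-composite(driver, cancel); err != nil {
//       log.Errorf("composite reconciliation failed: %v", err)
//   }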


@@ -16,3 +16,58 @@ limitations under the License.
// Package scheduler implements the Kubernetes Mesos scheduler.
package scheduler
// Created from contrib/mesos/docs/scheduler.monopic:
//
// ┌───────────────────────────────────────────────────────────────────────┐
// │ ┌───────────────────────────────────────┐ ┌─┴──────────────────────┐ ┌───────────────┐
// ┌────────▼─────────┐ │Queuer │ Await() │ podUpdates │ │ │
// │ podUpdatesBypass │ │- Yield() *api.Pod ├──pod CRUD ─▶ (queue.HistoricalFIFO) ◀──reflector──▶pods ListWatch ├──apiserver──▶
// └────────▲─────────┘ │- Requeue(pod)/Dequeue(id)/Reoffer(pod)│ events │ │ │ │
// │ └───────────────────▲───────────────────┘ └───────────┬────────────┘ └───────────────┘
// │ │ │
// │ │ │
// └───────────────┐┌───────────────────▲────────────────────▲─────────────────────┐ └───────────────────────┐
// ││ │ │ ┌────────────────────┼─────────────────┐
// ┌───────────────────┼┼──────────────────────────────────────┐ │ ┌───────────────────┼────┼───────────┐ │ │
// ┌───────────▼──────────┐┌───────┴┴───────┐ ┌───────────────────┐ ┌──┴─┴─┴──────┐ ┌────────┴────┴───┐ ┌────▼────────▼─────────────┐ │
// │Binder (task launcher)││Deleter │ │PodReconciler │ │Controller │ │ ErrorHandler │ │SchedulerAlgorithm │ │
// │- Bind(binding) ││- DeleteOne(pod)│ │- Reconcile(pod) │ │- Run() │ │- Error(pod, err)│ │- Schedule(pod) -> NodeName│ │
// │ ││ │◀──│ │ │ │──▶│ │ │ │ │
// │ ┌─────┐││ ┌─────┐ │ │ ┌─────┐ │ │ ┌─────┐ │ │ ┌─────┐ │ │┌─────┐ │ │
// └───────────────┤sched├┘└────┤sched├─────┘ └──────┤sched├───▲──┘ └───┤sched├───┘ └────┤sched├──────┘ └┤sched├──────────────┬─────┘ │
// ├-│││-┴──────┴--││-┴────────────────┴--│--┴───┼──────────┴--│--┴────────────┴-│---┴──────────┴-│││-┤ ┌────────────▼─────────▼─────────┐
// │ │││ ││ │ │ │ │ │││ │ │ podScheduler │
// │ ││└───────────▼┼─────────────────────▼──────┼─────────────▼─────────────────▼────────────────┘││ │ │ (e.g. fcfsPodScheduler) │
// │ │└─────────────┼────────────────────────────┼─────────────┼──────────────────▼────────────────┘│ │ │ │
// │ │ │ │ │ │ │ │ │ scheduleOne(pod, offers ...) │
// │ │ │ │ │ │ │ │ │ ┌──────────────────────────┤
// │ │ │ ╲ │ │ │ │ │ │ ▼ │ │ │ allocationStrategy │
// │ │ │ ╲ └┐ │ ┌┘ │ │ │ │ │ │ - FitPredicate │
// │ │ │ ╲ │ │ │ │ │ │ │ │ │ - Procurement │
// │ │ │ ╲ └┐ │ ┌┘ │ │ │ │ └─────┴──────────────────────────┘
// │┌▼────────────┐┌▼──────────┐┌─▼─▼─▼─▼─▼─┐┌───┴────────┐┌───▼───┐ ┌────▼───┐ │
// ││LaunchTask(t)││KillTask(t)││sync.Mutex ││reconcile(t)││Tasks()│ │Offers()│ │
// │└──────┬──────┘└─────┬─────┘└───────────┘└────────▲───┘└───┬───┘ └────┬───┘ │
// │ │ │ │ │ │ │
// │ │ └──────────────────┐ │ ┌───▼────────────┐ │ │
// │ └──────────────────────────────┐ │ │ │podtask.Registry│ │ │
// │ │ │ │ └────────────────┘ │ │ ┌──────────────────────┐
// │ │ │ │ │ │ │ │
// │Scheduler │ └──────┐ │ │ │ │ A ──────────▶ B │
// └──────────────────────────────────────┼────────┼─┬│----┬──────────────────────┼───────────────────┘ │ │
// ┌──────────────────────────────────────┼────────┼─┤sched├──────────────────────┼─────────────────────────┐ │ A has a reference │
// │Framework │ │ └─────┘ ┌────▼───┐ │ │ on B and calls B │
// │ ┌──────▼──────┐┌▼──────────┐ │Offers()│ │ │ │
// │ │LaunchTask(t)││KillTask(t)│ └────┬───┘ │ └──────────────────────┘
// │ └─────────┬───┘└──────┬────┘ ┌────────▼───────┐ │
// │implements: mesos-go/scheduler.Scheduler └───────────▼ │offers.Registry │ │
// │ │ └────────────────┘ │
// │ ┌─────────────────┐ ┌──▼─────────────┐ │
// └────────────────────────┤ ├───────┤ Mesos ├────────────────────────────────────┘
// │ TasksReconciler │ │ Scheduler │
// │ ├───────▶ Driver │
// └─────────────────┘ └────────┬───────┘
// │
// │
// ▼
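
The control flow in the diagram mirrors the upstream plugin loop: the Queuer yields a pod, the SchedulerAlgorithm picks a node, the Binder launches the matching Mesos task, and the ErrorHandler backs off and requeues on failure. A compressed sketch of that loop using the method names from the boxes above (event recording and shutdown plumbing omitted; the receiver names are illustrative):

for {
	pod := qr.Yield()                  // Queuer: blocks until an unscheduled pod is available
	nodeName, err := alg.Schedule(pod) // SchedulerAlgorithm: Schedule(pod) -> NodeName
	if err != nil {
		errh.Error(pod, err) // ErrorHandler: backoff, then Requeue/Reoffer via the Queuer
		continue
	}
	binding := &api.Binding{
		ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
		Target:     api.ObjectReference{Kind: "Node", Name: nodeName},
	}
	if err := binder.Bind(binding); err != nil { // Binder launches the pod as a Mesos task
		errh.Error(pod, err)
	}
}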

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package errors contains the errors used throughout the scheduler
package errors

View File

@@ -0,0 +1,28 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package errors
import (
"errors"
)
var (
NoSuchPodErr = errors.New("No such pod exists")
NoSuchTaskErr = errors.New("No such task exists")
ReconciliationCancelledErr = errors.New("explicit task reconciliation cancelled")
NoSuitableOffersErr = errors.New("No suitable offers for pod/task")
)
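
These are sentinel values, so callers across the scheduler match them by identity rather than by message text. A hedged sketch of a typical call site (the deleter d is illustrative; DeleteOne is the Deleter method from the component diagram):

if err := d.DeleteOne(pod); err == errors.NoSuchPodErr || err == errors.NoSuchTaskErr {
	// pod/task already gone; deletion is a no-op
	log.V(2).Infof("pod %q already deleted", pod.Name)
} else if err != nil {
	return err
}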

View File

@@ -112,10 +112,10 @@ type SchedulerProcess struct {
fin chan struct{}
}
func New(sched bindings.Scheduler) *SchedulerProcess {
func New(framework bindings.Scheduler) *SchedulerProcess {
p := &SchedulerProcess{
Process: proc.New(),
Scheduler: sched,
Scheduler: framework,
stage: initStage,
elected: make(chan struct{}),
failover: make(chan struct{}),

View File

@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package integration implements integration tests.
package integration

View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package integration
import (
"encoding/json"
@@ -25,14 +25,6 @@ import (
"testing"
"time"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/testapi"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/client/cache"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/runtime"
"k8s.io/kubernetes/pkg/watch"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/mesos/mesos-go/mesosutil"
@@ -41,13 +33,24 @@ import (
"github.com/stretchr/testify/mock"
assertext "k8s.io/kubernetes/contrib/mesos/pkg/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/controller"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/testapi"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/client/cache"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/runtime"
"k8s.io/kubernetes/pkg/util"
"k8s.io/kubernetes/pkg/watch"
)
// An apiserver mock which partially mocks the pods API
@@ -399,19 +402,6 @@ func (a *EventAssertions) EventWithReason(observer *EventObserver, reason string
}, msgAndArgs...)
}
type joinableDriver struct {
MockSchedulerDriver
joinFunc func() (mesos.Status, error)
}
// Join invokes joinFunc if it has been set, otherwise blocks forever
func (m *joinableDriver) Join() (mesos.Status, error) {
if m.joinFunc != nil {
return m.joinFunc()
}
select {}
}
// Create mesos.TaskStatus for a given task
func newTaskStatusForTask(task *mesos.TaskInfo, state mesos.TaskState) *mesos.TaskStatus {
healthy := state == mesos.TaskState_TASK_RUNNING
@@ -436,12 +426,12 @@ type LaunchedTask struct {
type lifecycleTest struct {
apiServer *TestServer
driver *joinableDriver
driver *framework.JoinableDriver
eventObs *EventObserver
plugin *schedulingPlugin
podsListWatch *MockPodsListWatch
scheduler *KubernetesScheduler
framework framework.Framework
schedulerProc *ha.SchedulerProcess
sched scheduler.Scheduler
t *testing.T
}
@@ -454,15 +444,33 @@ func newLifecycleTest(t *testing.T) lifecycleTest {
// create fake apiserver
apiServer := NewTestServer(t, api.NamespaceDefault, podsListWatch)
// create executor with some data for static pods if set
executor := mesosutil.NewExecutorInfo(
// create ExecutorInfo with some data for static pods if set
ei := mesosutil.NewExecutorInfo(
mesosutil.NewExecutorID("executor-id"),
mesosutil.NewCommandInfo("executor-cmd"),
)
executor.Data = []byte{0, 1, 2}
ei.Data = []byte{0, 1, 2}
// create scheduler
strategy := NewAllocationStrategy(
// create framework
client := client.NewOrDie(&client.Config{
Host: apiServer.server.URL,
Version: testapi.Default.Version(),
})
c := *schedcfg.CreateDefaultConfig()
fw := framework.New(framework.Config{
Executor: ei,
Client: client,
SchedulerConfig: c,
LookupNode: apiServer.LookupNode,
})
// TODO(sttts): re-enable the following tests
// assert.NotNil(framework.client, "client is nil")
// assert.NotNil(framework.executor, "executor is nil")
// assert.NotNil(framework.offers, "offer registry is nil")
// create pod scheduler
strategy := podschedulers.NewAllocationStrategy(
podtask.NewDefaultPredicate(
mresource.DefaultDefaultContainerCPULimit,
mresource.DefaultDefaultContainerMemLimit,
@@ -472,64 +480,39 @@ func newLifecycleTest(t *testing.T) lifecycleTest {
mresource.DefaultDefaultContainerMemLimit,
),
)
scheduler := New(Config{
Executor: executor,
Client: client.NewOrDie(&client.Config{
Host: apiServer.server.URL,
Version: testapi.Default.Version(),
}),
Scheduler: NewFCFSPodScheduler(strategy, apiServer.LookupNode),
Schedcfg: *schedcfg.CreateDefaultConfig(),
LookupNode: apiServer.LookupNode,
})
assert.NotNil(scheduler.client, "client is nil")
assert.NotNil(scheduler.executor, "executor is nil")
assert.NotNil(scheduler.offers, "offer registry is nil")
fcfs := podschedulers.NewFCFSPodScheduler(strategy, apiServer.LookupNode)
// create scheduler process
schedulerProc := ha.New(scheduler)
schedulerProc := ha.New(fw)
// get plugin config from it
config := scheduler.NewPluginConfig(
schedulerProc.Terminal(),
http.DefaultServeMux,
&podsListWatch.ListWatch,
)
assert.NotNil(config)
// make events observable
// create scheduler
eventObs := NewEventObserver()
config.Recorder = eventObs
// create plugin
plugin := NewPlugin(config).(*schedulingPlugin)
assert.NotNil(plugin)
scheduler := components.New(&c, fw, fcfs, client, eventObs, schedulerProc.Terminal(), http.DefaultServeMux, &podsListWatch.ListWatch)
assert.NotNil(scheduler)
// create mock mesos scheduler driver
driver := &joinableDriver{}
driver := &framework.JoinableDriver{}
return lifecycleTest{
apiServer: apiServer,
driver: driver,
eventObs: eventObs,
plugin: plugin,
podsListWatch: podsListWatch,
scheduler: scheduler,
framework: fw,
schedulerProc: schedulerProc,
sched: scheduler,
t: t,
}
}
func (lt lifecycleTest) Start() <-chan LaunchedTask {
assert := &EventAssertions{*assert.New(lt.t)}
lt.plugin.Run(lt.schedulerProc.Terminal())
lt.sched.Run(lt.schedulerProc.Terminal())
// init scheduler
err := lt.scheduler.Init(
// init framework
err := lt.framework.Init(
lt.sched,
lt.schedulerProc.Master(),
lt.plugin,
http.DefaultServeMux,
)
assert.NoError(err)
@@ -582,7 +565,7 @@ func (lt lifecycleTest) Start() <-chan LaunchedTask {
<-started
// tell scheduler to be registered
lt.scheduler.Registered(
lt.framework.Registered(
lt.driver,
mesosutil.NewFrameworkID("kubernetes-id"),
mesosutil.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050),
@@ -601,19 +584,10 @@ func (lt lifecycleTest) End() <-chan struct{} {
return lt.schedulerProc.End()
}
// Test to create the scheduler plugin with an empty plugin config
func TestPlugin_New(t *testing.T) {
assert := assert.New(t)
c := PluginConfig{}
p := NewPlugin(&c)
assert.NotNil(p)
}
// TestPlugin_LifeCycle creates a scheduler plugin with the config returned by the scheduler,
// TestScheduler_LifeCycle creates a scheduler plugin with the config returned by the scheduler,
// and plays through the whole life cycle of the plugin while creating pods, deleting
// and failing them.
func TestPlugin_LifeCycle(t *testing.T) {
func TestScheduler_LifeCycle(t *testing.T) {
assert := &EventAssertions{*assert.New(t)}
lt := newLifecycleTest(t)
defer lt.Close()
@@ -627,29 +601,29 @@ func TestPlugin_LifeCycle(t *testing.T) {
lt.podsListWatch.Add(pod, true) // notify watchers
// wait for failedScheduling event because there is no offer
assert.EventWithReason(lt.eventObs, FailedScheduling, "failedScheduling event not received")
assert.EventWithReason(lt.eventObs, controller.FailedScheduling, "failedScheduling event not received")
// add some matching offer
offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
lt.scheduler.ResourceOffers(nil, offers)
lt.framework.ResourceOffers(nil, offers)
// first offer is declined because node is not available yet
lt.apiServer.WaitForNode("some_hostname")
// add one more offer
lt.scheduler.ResourceOffers(nil, offers)
lt.framework.ResourceOffers(nil, offers)
// and wait for scheduled pod
assert.EventWithReason(lt.eventObs, Scheduled)
assert.EventWithReason(lt.eventObs, controller.Scheduled)
select {
case launchedTask := <-launchedTasks:
// report back that the task has been staged, and then started by mesos
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
)
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
)
@@ -660,7 +634,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
// report back that the task has been lost
lt.driver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_LOST),
)
@@ -677,22 +651,22 @@ func TestPlugin_LifeCycle(t *testing.T) {
// Launch a pod and wait until the scheduler driver is called
schedulePodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
// wait for failedScheduling event because there is no offer
assert.EventWithReason(lt.eventObs, FailedScheduling, "failedScheduling event not received")
assert.EventWithReason(lt.eventObs, controller.FailedScheduling, "failedScheduling event not received")
// supply a matching offer
lt.scheduler.ResourceOffers(lt.driver, offers)
lt.framework.ResourceOffers(lt.driver, offers)
for _, offer := range offers {
if _, ok := offeredNodes[offer.GetHostname()]; !ok {
offeredNodes[offer.GetHostname()] = struct{}{}
lt.apiServer.WaitForNode(offer.GetHostname())
// reoffer since it must have been declined above
lt.scheduler.ResourceOffers(lt.driver, []*mesos.Offer{offer})
lt.framework.ResourceOffers(lt.driver, []*mesos.Offer{offer})
}
}
// and wait to get scheduled
assert.EventWithReason(lt.eventObs, Scheduled)
assert.EventWithReason(lt.eventObs, controller.Scheduled)
// wait for driver.launchTasks call
select {
@@ -722,11 +696,11 @@ func TestPlugin_LifeCycle(t *testing.T) {
pod, launchedTask, offer := launchPodWithOffers(pod, offers)
if pod != nil {
// report back status
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
)
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
)
@@ -762,7 +736,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
select {
case <-killTaskCalled:
// report back that the task is finished
lt.scheduler.StatusUpdate(
lt.framework.StatusUpdate(
lt.driver,
newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_FINISHED),
)
@@ -787,8 +761,8 @@ func TestPlugin_LifeCycle(t *testing.T) {
assert.Equal(offers[1].Id.GetValue(), usedOffer.Id.GetValue())
assert.Equal(pod.Spec.NodeName, *usedOffer.Hostname)
lt.scheduler.OfferRescinded(lt.driver, offers[0].Id)
lt.scheduler.OfferRescinded(lt.driver, offers[2].Id)
lt.framework.OfferRescinded(lt.driver, offers[0].Id)
lt.framework.OfferRescinded(lt.driver, offers[2].Id)
// start pods:
// - which are failing while binding,
@@ -800,7 +774,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
message := messages.CreateBindingFailure
status.Message = &message
lt.scheduler.StatusUpdate(lt.driver, status)
lt.framework.StatusUpdate(lt.driver, status)
// wait until pod is looked up at the apiserver
assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
@@ -822,7 +796,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
podKey, _ := podtask.MakePodKey(api.NewDefaultContext(), pod.Name)
assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
t, _ := lt.plugin.api.tasks().ForPod(podKey)
t, _ := lt.sched.Tasks().ForPod(podKey)
return t == nil
})
@@ -845,143 +819,3 @@ func TestPlugin_LifeCycle(t *testing.T) {
time.Sleep(time.Second / 2)
failPodFromExecutor(launchedTask.taskInfo)
}
func TestDeleteOne_NonexistentPod(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
qr := newQueuer(nil)
assert.Equal(0, len(qr.podQueue.List()))
d := &deleter{
api: obj,
qr: qr,
}
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
Namespace: api.NamespaceDefault,
}}}
err := d.deleteOne(pod)
assert.Equal(err, noSuchPodErr)
obj.AssertExpectations(t)
}
func TestDeleteOne_PendingPod(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{})
if err != nil {
t.Fatalf("failed to create task: %v", err)
}
_, err = reg.Register(task)
if err != nil {
t.Fatalf("failed to register task: %v", err)
}
// preconditions
qr := newQueuer(nil)
qr.podQueue.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(qr.podQueue.List()))
_, found := qr.podQueue.Get("default/foo")
assert.True(found)
// exec & post conditions
d := &deleter{
api: obj,
qr: qr,
}
err = d.deleteOne(pod)
assert.Nil(err)
_, found = qr.podQueue.Get("foo0")
assert.False(found)
assert.Equal(0, len(qr.podQueue.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_Running(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("tasks").Return(reg)
pod := &Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task, err = reg.Register(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
task.Set(podtask.Launched)
err = reg.Update(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
// preconditions
qr := newQueuer(nil)
qr.podQueue.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(qr.podQueue.List()))
_, found := qr.podQueue.Get("default/foo")
assert.True(found)
obj.On("killTask", task.ID).Return(nil)
// exec & post conditions
d := &deleter{
api: obj,
qr: qr,
}
err = d.deleteOne(pod)
assert.Nil(err)
_, found = qr.podQueue.Get("foo0")
assert.False(found)
assert.Equal(0, len(qr.podQueue.List()))
obj.AssertExpectations(t)
}
func TestDeleteOne_badPodNaming(t *testing.T) {
assert := assert.New(t)
obj := &MockScheduler{}
pod := &Pod{Pod: &api.Pod{}}
d := &deleter{
api: obj,
qr: newQueuer(nil),
}
err := d.deleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = "foo"
err = d.deleteOne(pod)
assert.NotNil(err)
pod.Pod.ObjectMeta.Name = ""
pod.Pod.ObjectMeta.Namespace = "bar"
err = d.deleteOne(pod)
assert.NotNil(err)
obj.AssertExpectations(t)
}

View File

@@ -25,7 +25,6 @@ const (
TaskIdKey = "k8s.mesosphere.io/taskId"
SlaveIdKey = "k8s.mesosphere.io/slaveId"
OfferIdKey = "k8s.mesosphere.io/offerId"
ExecutorIdKey = "k8s.mesosphere.io/executorId"
PortMappingKeyPrefix = "k8s.mesosphere.io/port_"
PortMappingKeyFormat = PortMappingKeyPrefix + "%s_%d"
PortNameMappingKeyPrefix = "k8s.mesosphere.io/portName_"

View File

@@ -1,930 +0,0 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"io"
"net/http"
"strconv"
"sync"
"time"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
"k8s.io/kubernetes/pkg/client/cache"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/util"
plugin "k8s.io/kubernetes/plugin/pkg/scheduler"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
)
const (
enqueuePopTimeout = 200 * time.Millisecond
enqueueWaitTimeout = 1 * time.Second
yieldPopTimeout = 200 * time.Millisecond
yieldWaitTimeout = 1 * time.Second
pluginRecoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling
)
const (
FailedScheduling = "FailedScheduling"
Scheduled = "Scheduled"
)
// scheduler abstraction to allow for easier unit testing
type schedulerInterface interface {
sync.Locker // synchronize scheduler plugin operations
SlaveIndex
algorithm() PodScheduler
offers() offers.Registry
tasks() podtask.Registry
// driver calls
killTask(taskId string) error
launchTask(*podtask.T) error
// convenience
createPodTask(api.Context, *api.Pod) (*podtask.T, error)
}
type k8smScheduler struct {
sync.Mutex
internal *KubernetesScheduler
}
func (k *k8smScheduler) algorithm() PodScheduler {
return k.internal
}
func (k *k8smScheduler) offers() offers.Registry {
return k.internal.offers
}
func (k *k8smScheduler) tasks() podtask.Registry {
return k.internal.taskRegistry
}
func (k *k8smScheduler) createPodTask(ctx api.Context, pod *api.Pod) (*podtask.T, error) {
return podtask.New(ctx, "", *pod, k.internal.executor)
}
func (k *k8smScheduler) slaveHostNameFor(id string) string {
return k.internal.slaveHostNames.HostName(id)
}
func (k *k8smScheduler) killTask(taskId string) error {
killTaskId := mutil.NewTaskID(taskId)
_, err := k.internal.driver.KillTask(killTaskId)
return err
}
func (k *k8smScheduler) launchTask(task *podtask.T) error {
// assume caller is holding scheduler lock
taskList := []*mesos.TaskInfo{task.BuildTaskInfo()}
offerIds := []*mesos.OfferID{task.Offer.Details().Id}
filters := &mesos.Filters{}
_, err := k.internal.driver.LaunchTasks(offerIds, taskList, filters)
return err
}
type binder struct {
api schedulerInterface
}
// implements binding.Registry, launches the pod-associated-task in mesos
func (b *binder) Bind(binding *api.Binding) error {
ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
// default upstream scheduler passes pod.Name as binding.Name
podKey, err := podtask.MakePodKey(ctx, binding.Name)
if err != nil {
return err
}
b.api.Lock()
defer b.api.Unlock()
switch task, state := b.api.tasks().ForPod(podKey); state {
case podtask.StatePending:
return b.bind(ctx, binding, task)
default:
// in this case it's likely that the pod has been deleted between Schedule
// and Bind calls
log.Infof("No pending task for pod %s", podKey)
return noSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
}
}
func (b *binder) rollback(task *podtask.T, err error) error {
task.Offer.Release()
task.Reset()
if err2 := b.api.tasks().Update(task); err2 != nil {
log.Errorf("failed to update pod task: %v", err2)
}
return err
}
// assumes that the caller has acquired the scheduler lock and that the task is still pending
//
// bind does not actually do the binding itself, but launches the pod as a Mesos task. The
// kubernetes executor on the slave will finally do the binding. This is different from the
// upstream scheduler in the sense that the upstream scheduler does the binding and the
// kubelet will notice that and launch the pod.
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
// sanity check: ensure that the task hasAcceptedOffer(); it's possible that between
// Schedule() and now the offer for this task was rescinded or invalidated.
// ((we should never see this here))
if !task.HasAcceptedOffer() {
return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
}
// By this time, there is a chance that the slave is disconnected.
offerId := task.GetOfferId()
if offer, ok := b.api.offers().Get(offerId); !ok || offer.HasExpired() {
// already rescinded or timed out or otherwise invalidated
return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
}
if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\", cpu %.2f, mem %.2f MB",
task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name, task.Spec.CPU, task.Spec.Memory)
if err = b.api.launchTask(task); err == nil {
b.api.offers().Invalidate(offerId)
task.Set(podtask.Launched)
if err = b.api.tasks().Update(task); err != nil {
// this should only happen if the task has been removed or has changed status,
// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
log.Errorf("failed to update task w/ Launched status: %v", err)
}
return
}
}
return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err))
}
//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
pod := task.Pod
// we make an effort here to avoid making changes to the task's copy of the pod, since
// we want that to reflect the initial user spec, and not the modified spec that we
// build for the executor to consume.
oemCt := pod.Spec.Containers
pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
}
task.SaveRecoveryInfo(pod.Annotations)
pod.Annotations[annotation.BindingHostKey] = task.Spec.AssignedSlave
for _, entry := range task.Spec.PortMap {
oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
ports := append([]api.ContainerPort{}, oemPorts...)
p := &ports[entry.PortIdx]
p.HostPort = int(entry.OfferPort)
op := strconv.FormatUint(entry.OfferPort, 10)
pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
if p.Name != "" {
pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
}
pod.Spec.Containers[entry.ContainerIdx].Ports = ports
}
// the kubelet-executor uses this to instantiate the pod
log.V(3).Infof("prepared pod spec: %+v", pod)
data, err := api.Codec.Encode(&pod)
if err != nil {
log.V(2).Infof("Failed to marshal the pod spec: %v", err)
return err
}
task.Spec.Data = data
return nil
}
type kubeScheduler struct {
api schedulerInterface
podUpdates queue.FIFO
}
// recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
// the BindingHostKey. For tasks in the registry of the scheduler, the same
// value is stored in T.Spec.AssignedSlave. Before launching, the BindingHostKey
// annotation is added and the executor will eventually persist that to the
// apiserver on binding.
func recoverAssignedSlave(pod *api.Pod) string {
return pod.Annotations[annotation.BindingHostKey]
}
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selectedMachine's name and an error, if any.
func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.NodeLister) (string, error) {
log.Infof("Try to schedule pod %v\n", pod.Name)
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
// default upstream scheduler passes pod.Name as binding.PodID
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return "", err
}
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// There's a bit of a potential race here, a pod could have been yielded() and
// then before we get *here* it could be deleted.
// We use meta to index the pod in the store since that's what k8s reflector does.
podName, err := cache.MetaNamespaceKeyFunc(pod)
if err != nil {
log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
return "", noSuchPodErr
}
if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
log.Infof("aborting Schedule, pod has been deleted %+v", pod)
return "", noSuchPodErr
}
task, err := k.api.createPodTask(ctx, pod)
if err != nil {
return "", err
}
task, err = k.api.tasks().Register(task)
if err != nil {
return "", err
}
return k.doSchedule(task)
//TODO(jdef) it's possible that the pod state has diverged from what
//we knew previously, we should probably update the task.Pod state here
//before proceeding with scheduling
case podtask.StatePending:
if pod.UID != task.Pod.UID {
// we're dealing with a brand new pod spec here, so the old one must have been
// deleted -- and so our task store is out of sync w/ respect to reality
//TODO(jdef) reconcile task
return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
} else if task.Has(podtask.Launched) {
// task has been marked as "launched" but the pod binding creation may have failed in k8s,
// but we're going to let someone else handle it, probably the mesos task error handler
return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
} else {
return k.doSchedule(task)
}
default:
return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
}
}
// doSchedule schedules the given task and returns the machine the task is scheduled on
// or an error if the scheduling failed.
func (k *kubeScheduler) doSchedule(task *podtask.T) (string, error) {
var offer offers.Perishable
var err error
if task.HasAcceptedOffer() {
// verify that the offer is still on the table
var ok bool
offer, ok = k.api.offers().Get(task.GetOfferId())
if !ok || offer.HasExpired() {
task.Offer.Release()
task.Reset()
if err = k.api.tasks().Update(task); err != nil {
return "", err
}
}
}
if offer == nil {
offer, err = k.api.algorithm().SchedulePod(k.api.offers(), k.api, task)
}
if err != nil {
return "", err
}
details := offer.Details()
if details == nil {
return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
}
slaveId := details.GetSlaveId().GetValue()
slaveHostName := k.api.slaveHostNameFor(slaveId)
if slaveHostName == "" {
// not much sense in Release()ing the offer here since its owner died
offer.Release()
k.api.offers().Invalidate(details.Id.GetValue())
return "", fmt.Errorf("Slave disappeared (%v) while scheduling task %v", slaveId, task.ID)
}
if task.Offer != nil && task.Offer != offer {
return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
}
task.Offer = offer
if err := k.api.algorithm().Procurement()(task, details); err != nil {
offer.Release()
task.Reset()
return "", err
}
if err := k.api.tasks().Update(task); err != nil {
offer.Release()
return "", err
}
return slaveHostName, nil
}
type queuer struct {
lock sync.Mutex // shared by condition variables of this struct
podUpdates queue.FIFO // queue of pod updates to be processed
podQueue *queue.DelayFIFO // queue of pods to be scheduled
deltaCond sync.Cond // pod changes are available for processing
unscheduledCond sync.Cond // there are unscheduled pods for processing
}
func newQueuer(store queue.FIFO) *queuer {
q := &queuer{
podQueue: queue.NewDelayFIFO(),
podUpdates: store,
}
q.deltaCond.L = &q.lock
q.unscheduledCond.L = &q.lock
return q
}
func (q *queuer) installDebugHandlers(mux *http.ServeMux) {
mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.podQueue.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.podUpdates.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
}
// signal that there are probably pod updates waiting to be processed
func (q *queuer) updatesAvailable() {
q.deltaCond.Broadcast()
}
// delete a pod from the to-be-scheduled queue
func (q *queuer) dequeue(id string) {
q.podQueue.Delete(id)
}
// re-add a pod to the to-be-scheduled queue, will not overwrite existing pod data (that
// may have already changed).
func (q *queuer) requeue(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
q.podQueue.Add(pod, queue.KeepExisting)
q.unscheduledCond.Broadcast()
}
// same as requeue but calls podQueue.Offer instead of podQueue.Add
func (q *queuer) reoffer(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
if q.podQueue.Offer(pod, queue.KeepExisting) {
q.unscheduledCond.Broadcast()
}
}
// spawns a go-routine to watch for unscheduled pods and queue them up
// for scheduling. returns immediately.
func (q *queuer) Run(done <-chan struct{}) {
go runtime.Until(func() {
log.Info("Watching for newly created pods")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here for short intervals so that scheduling
// may proceed even if there have been no recent pod changes
p := q.podUpdates.Await(enqueuePopTimeout)
if p == nil {
signalled := runtime.After(q.deltaCond.Wait)
// we've yielded the lock
select {
case <-time.After(enqueueWaitTimeout):
q.deltaCond.Broadcast() // abort Wait()
<-signalled // wait for lock re-acquisition
log.V(4).Infoln("timed out waiting for a pod update")
case <-signalled:
// we've acquired the lock and there may be
// changes for us to process now
}
continue
}
pod := p.(*Pod)
if recoverAssignedSlave(pod.Pod) != "" {
log.V(3).Infof("dequeuing assigned pod for scheduling: %v", pod.Pod.Name)
q.dequeue(pod.GetUID())
} else {
// use ReplaceExisting because we are always pushing the latest state
now := time.Now()
pod.deadline = &now
if q.podQueue.Offer(pod, queue.ReplaceExisting) {
q.unscheduledCond.Broadcast()
log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
} else {
log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
}
}
}
}, 1*time.Second, done)
}
// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
func (q *queuer) yield() *api.Pod {
log.V(2).Info("attempting to yield a pod")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here to short intervals so that we don't block the
// enqueuer Run() routine for very long
kpod := q.podQueue.Await(yieldPopTimeout)
if kpod == nil {
signalled := runtime.After(q.unscheduledCond.Wait)
// lock is yielded at this point and we're going to wait for either
// a timeout, or a signal that there's data
select {
case <-time.After(yieldWaitTimeout):
q.unscheduledCond.Broadcast() // abort Wait()
<-signalled // wait for the go-routine, and the lock
log.V(4).Infoln("timed out waiting for a pod to yield")
case <-signalled:
// we have acquired the lock, and there
// may be a pod for us to pop now
}
continue
}
pod := kpod.(*Pod).Pod
if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
} else if recoverAssignedSlave(pod) != "" {
// should never happen if enqueuePods is filtering properly
log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
} else {
return pod
}
}
}
type errorHandler struct {
api schedulerInterface
backoff *backoff.Backoff
qr *queuer
}
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {
if schedulingErr == noSuchPodErr {
log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
return
}
log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
defer util.HandleCrash()
// default upstream scheduler passes pod.Name as binding.PodID
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
return
}
k.backoff.GC()
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// if we don't have a mapping here any more then someone deleted the pod
log.V(2).Infof("Could not resolve pod to task, aborting pod reschdule: %s", podKey)
return
case podtask.StatePending:
if task.Has(podtask.Launched) {
log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
return
}
breakoutEarly := queue.BreakChan(nil)
if schedulingErr == noSuitableOffersErr {
log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
breakoutEarly = queue.BreakChan(k.api.offers().Listen(podKey, func(offer *mesos.Offer) bool {
k.api.Lock()
defer k.api.Unlock()
switch task, state := k.api.tasks().Get(task.ID); state {
case podtask.StatePending:
// Assess fitness of pod with the current offer. The scheduler normally
// "backs off" when it can't find an offer that matches up with a pod.
// The backoff period for a pod can terminate sooner if an offer becomes
// available that matches up.
return !task.Has(podtask.Launched) && k.api.algorithm().FitPredicate()(task, offer, nil)
default:
// no point in continuing to check for matching offers
return true
}
}))
}
delay := k.backoff.Get(podKey)
log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
k.qr.requeue(&Pod{Pod: pod, delay: &delay, notify: breakoutEarly})
default:
log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
}
}
type deleter struct {
api schedulerInterface
qr *queuer
}
// currently monitors for "pod deleted" events, upon which deleteOne()
// is invoked.
func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
go runtime.Until(func() {
for {
entry := <-updates
pod := entry.Value().(*Pod)
if entry.Is(queue.DELETE_EVENT) {
if err := k.deleteOne(pod); err != nil {
log.Error(err)
}
} else if !entry.Is(queue.POP_EVENT) {
k.qr.updatesAvailable()
}
}
}, 1*time.Second, done)
}
func (k *deleter) deleteOne(pod *Pod) error {
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
return err
}
log.V(2).Infof("pod deleted: %v", podKey)
// order is important here: we want to make sure we have the lock before
// removing the pod from the scheduling queue. this makes the concurrent
// execution of scheduler-error-handling and delete-handling easier to
// reason about.
k.api.Lock()
defer k.api.Unlock()
// prevent the scheduler from attempting to pop this; it's also possible that
// it's concurrently being scheduled (somewhere between pod scheduling and
// binding) - if so, then we'll end up removing it from taskRegistry which
// will abort Bind()ing
k.qr.dequeue(pod.GetUID())
switch task, state := k.api.tasks().ForPod(podKey); state {
case podtask.StateUnknown:
log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
return noSuchPodErr
// determine if the task has already been launched to mesos, if not then
// cleanup is easier (unregister) since there's no state to sync
case podtask.StatePending:
if !task.Has(podtask.Launched) {
// we've been invoked in between Schedule() and Bind()
if task.HasAcceptedOffer() {
task.Offer.Release()
task.Reset()
task.Set(podtask.Deleted)
//TODO(jdef) probably want better handling here
if err := k.api.tasks().Update(task); err != nil {
return err
}
}
k.api.tasks().Unregister(task)
return nil
}
fallthrough
case podtask.StateRunning:
// signal to watchers that the related pod is going down
task.Set(podtask.Deleted)
if err := k.api.tasks().Update(task); err != nil {
log.Errorf("failed to update task w/ Deleted status: %v", err)
}
return k.api.killTask(task.ID)
default:
log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
return noSuchTaskErr
}
}
// Create creates a scheduler plugin and all supporting background functions.
func (k *KubernetesScheduler) NewDefaultPluginConfig(terminate <-chan struct{}, mux *http.ServeMux) *PluginConfig {
// use ListWatch watching pods using the client by default
return k.NewPluginConfig(terminate, mux, createAllPodsLW(k.client))
}
func (k *KubernetesScheduler) NewPluginConfig(terminate <-chan struct{}, mux *http.ServeMux,
podsWatcher *cache.ListWatch) *PluginConfig {
// Watch and queue pods that need scheduling.
updates := make(chan queue.Entry, k.schedcfg.UpdatesBacklog)
podUpdates := &podStoreAdapter{queue.NewHistorical(updates)}
reflector := cache.NewReflector(podsWatcher, &api.Pod{}, podUpdates, 0)
// lock that guards critical sections that involve transferring pods from
// the store (cache) to the scheduling queue; its purpose is to maintain
// an ordering (vs interleaving) of operations that's easier to reason about.
kapi := &k8smScheduler{internal: k}
q := newQueuer(podUpdates)
podDeleter := &deleter{
api: kapi,
qr: q,
}
eh := &errorHandler{
api: kapi,
backoff: backoff.New(k.schedcfg.InitialPodBackoff.Duration, k.schedcfg.MaxPodBackoff.Duration),
qr: q,
}
startLatch := make(chan struct{})
eventBroadcaster := record.NewBroadcaster()
runtime.On(startLatch, func() {
eventBroadcaster.StartRecordingToSink(k.client.Events(""))
reflector.Run() // TODO(jdef) should listen for termination
podDeleter.Run(updates, terminate)
q.Run(terminate)
q.installDebugHandlers(mux)
podtask.InstallDebugHandlers(k.taskRegistry, mux)
})
return &PluginConfig{
Config: &plugin.Config{
NodeLister: nil,
Algorithm: &kubeScheduler{
api: kapi,
podUpdates: podUpdates,
},
Binder: &binder{api: kapi},
NextPod: q.yield,
Error: eh.handleSchedulingError,
Recorder: eventBroadcaster.NewRecorder(api.EventSource{Component: "scheduler"}),
},
api: kapi,
client: k.client,
qr: q,
deleter: podDeleter,
starting: startLatch,
}
}
type PluginConfig struct {
*plugin.Config
api schedulerInterface
client *client.Client
qr *queuer
deleter *deleter
starting chan struct{} // startup latch
}
func NewPlugin(c *PluginConfig) PluginInterface {
return &schedulingPlugin{
config: c.Config,
api: c.api,
client: c.client,
qr: c.qr,
deleter: c.deleter,
starting: c.starting,
}
}
type schedulingPlugin struct {
config *plugin.Config
api schedulerInterface
client *client.Client
qr *queuer
deleter *deleter
starting chan struct{}
}
func (s *schedulingPlugin) Run(done <-chan struct{}) {
defer close(s.starting)
go runtime.Until(s.scheduleOne, pluginRecoveryDelay, done)
}
// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go,
// with the Modeler stuff removed since we don't use it because we have mesos.
func (s *schedulingPlugin) scheduleOne() {
pod := s.config.NextPod()
// pods which are pre-scheduled (i.e. NodeName is set) are deleted by the kubelet
// in upstream. Not so in Mesos because the kubelet hasn't seen that pod yet. Hence,
// the scheduler has to take care of this:
if pod.Spec.NodeName != "" && pod.DeletionTimestamp != nil {
log.V(3).Infof("deleting pre-scheduled, not yet running pod: %s/%s", pod.Namespace, pod.Name)
s.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0))
return
}
log.V(3).Infof("Attempting to schedule: %+v", pod)
dest, err := s.config.Algorithm.Schedule(pod, s.config.NodeLister) // call kubeScheduler.Schedule
if err != nil {
log.V(1).Infof("Failed to schedule: %+v", pod)
s.config.Recorder.Eventf(pod, FailedScheduling, "Error scheduling: %v", err)
s.config.Error(pod, err)
return
}
b := &api.Binding{
ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
Target: api.ObjectReference{
Kind: "Node",
Name: dest,
},
}
if err := s.config.Binder.Bind(b); err != nil {
log.V(1).Infof("Failed to bind pod: %+v", err)
s.config.Recorder.Eventf(pod, FailedScheduling, "Binding rejected: %v", err)
s.config.Error(pod, err)
return
}
s.config.Recorder.Eventf(pod, Scheduled, "Successfully assigned %v to %v", pod.Name, dest)
}
// this pod may be out of sync with respect to the API server registry:
// this pod | apiserver registry
// -------------|----------------------
// host=.* | 404 ; pod was deleted
// host=.* | 5xx ; failed to sync, try again later?
// host="" | host="" ; perhaps no updates to process?
// host="" | host="..." ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
// host="..." | host="" ; pod is no longer scheduled, does it need to be re-queued?
// host="..." | host="..." ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *schedulingPlugin) reconcileTask(t *podtask.T) {
log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
if err != nil {
if errors.IsNotFound(err) {
// attempt to delete
if err = s.deleter.deleteOne(&Pod{Pod: &t.Pod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
}
} else {
//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
//For now, drop the pod on the floor
log.Warning("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
}
return
}
log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
if t.Spec.AssignedSlave != pod.Spec.NodeName {
if pod.Spec.NodeName == "" {
// pod is unscheduled.
// it's possible that we dropped the pod in the scheduler error handler
// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Error(err)
return
}
s.api.Lock()
defer s.api.Unlock()
if _, state := s.api.tasks().ForPod(podKey); state != podtask.StateUnknown {
//TODO(jdef) reconcile the task
log.Errorf("task already registered for pod %v", pod.Name)
return
}
now := time.Now()
log.V(3).Infof("reoffering pod %v", podKey)
s.qr.reoffer(&Pod{
Pod: pod,
deadline: &now,
})
} else {
// pod is scheduled.
// not sure how this happened behind our backs. attempt to reconstruct
// at least a partial podtask.T record.
//TODO(jdef) reconcile the task
log.Errorf("pod already scheduled: %v", pod.Name)
}
} else {
//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
//and assume that our knowledge of the pod aligns with that of the apiserver
log.Error("pod reconciliation does not support updates; not yet implemented")
}
}
func parseSelectorOrDie(s string) fields.Selector {
selector, err := fields.ParseSelector(s)
if err != nil {
panic(err)
}
return selector
}
// createAllPodsLW returns a listWatch that finds all pods
func createAllPodsLW(cl *client.Client) *cache.ListWatch {
return cache.NewListWatchFromClient(cl, "pods", api.NamespaceAll, parseSelectorOrDie(""))
}
// Consumes *api.Pod, produces *Pod; the k8s reflector wants to push *api.Pod
// objects at us, but we want to store the more flexible Pod type defined in
// this package. The adapter implementation facilitates this. It's a little
// hackish since the object type going in is different than the object type
// coming out -- you've been warned.
type podStoreAdapter struct {
queue.FIFO
}
func (psa *podStoreAdapter) Add(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Add(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Update(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Update(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Delete(obj interface{}) error {
pod := obj.(*api.Pod)
return psa.FIFO.Delete(&Pod{Pod: pod})
}
func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
pod := obj.(*api.Pod)
return psa.FIFO.Get(&Pod{Pod: pod})
}
// Replace will delete the contents of the store, using instead the
// given map. This store implementation does NOT take ownership of the map.
func (psa *podStoreAdapter) Replace(objs []interface{}, resourceVersion string) error {
newobjs := make([]interface{}, len(objs))
for i, v := range objs {
pod := v.(*api.Pod)
newobjs[i] = &Pod{Pod: pod}
}
return psa.FIFO.Replace(newobjs, resourceVersion)
}

View File

@@ -18,6 +18,7 @@ package podtask
import (
"fmt"
"strings"
"time"
"github.com/gogo/protobuf/proto"
@@ -62,7 +63,6 @@ type T struct {
UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master
podStatus api.PodStatus
executor *mesos.ExecutorInfo // readonly
podKey string
launchTime time.Time
bindTime time.Time
@@ -130,21 +130,49 @@ func generateTaskName(pod *api.Pod) string {
return fmt.Sprintf("%s.%s.pods", pod.Name, ns)
}
func (t *T) BuildTaskInfo() *mesos.TaskInfo {
func setCommandArgument(ei *mesos.ExecutorInfo, flag, value string, create bool) {
argv := []string{}
overwrite := false
if ei.Command != nil && ei.Command.Arguments != nil {
argv = ei.Command.Arguments
for i, arg := range argv {
if strings.HasPrefix(arg, flag+"=") {
overwrite = true
argv[i] = flag + "=" + value
break
}
}
}
if !overwrite && create {
argv = append(argv, flag+"="+value)
if ei.Command == nil {
ei.Command = &mesos.CommandInfo{}
}
ei.Command.Arguments = argv
}
}
func (t *T) BuildTaskInfo(prototype *mesos.ExecutorInfo) *mesos.TaskInfo {
info := &mesos.TaskInfo{
Name: proto.String(generateTaskName(&t.Pod)),
TaskId: mutil.NewTaskID(t.ID),
SlaveId: mutil.NewSlaveID(t.Spec.SlaveID),
Executor: t.executor,
Executor: proto.Clone(prototype).(*mesos.ExecutorInfo),
Data: t.Spec.Data,
Resources: []*mesos.Resource{
mutil.NewScalarResource("cpus", float64(t.Spec.CPU)),
mutil.NewScalarResource("mem", float64(t.Spec.Memory)),
},
}
if portsResource := rangeResource("ports", t.Spec.Ports); portsResource != nil {
info.Resources = append(info.Resources, portsResource)
}
// the hostname of the executor needs to match that of the offer, otherwise
// the kubelet node status checker/updater is very unhappy
setCommandArgument(info.Executor, "--hostname-override", t.Spec.AssignedSlave, true)
return info
}
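
With this change a task no longer carries its own ExecutorInfo: the caller passes a prototype, BuildTaskInfo clones it per task, and --hostname-override is stamped into the clone at build time. A sketch of the launch path under that contract, reusing the LaunchTasks shape from the old launchTask elsewhere in this diff (driver and prototype are illustrative):

info := task.BuildTaskInfo(prototype) // clones prototype; the shared ExecutorInfo is never mutated
_, err := driver.LaunchTasks(
	[]*mesos.OfferID{task.Offer.Details().Id},
	[]*mesos.TaskInfo{info},
	&mesos.Filters{},
)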
@@ -170,10 +198,7 @@ func (t *T) Has(f FlagType) (exists bool) {
return
}
func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo) (*T, error) {
if executor == nil {
return nil, fmt.Errorf("illegal argument: executor was nil")
}
func New(ctx api.Context, id string, pod *api.Pod) (*T, error) {
key, err := MakePodKey(ctx, pod.Name)
if err != nil {
return nil, err
@@ -182,13 +207,12 @@ func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo)
id = "pod." + uuid.NewUUID().String()
}
task := &T{
ID: id,
Pod: pod,
State: StatePending,
podKey: key,
mapper: MappingTypeForPod(&pod),
Flags: make(map[FlagType]struct{}),
executor: proto.Clone(executor).(*mesos.ExecutorInfo),
ID: id,
Pod: *pod,
State: StatePending,
podKey: key,
mapper: MappingTypeForPod(pod),
Flags: make(map[FlagType]struct{}),
}
task.CreateTime = time.Now()
return task, nil
@@ -198,7 +222,6 @@ func (t *T) SaveRecoveryInfo(dict map[string]string) {
dict[annotation.TaskIdKey] = t.ID
dict[annotation.SlaveIdKey] = t.Spec.SlaveID
dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue()
dict[annotation.ExecutorIdKey] = t.executor.ExecutorId.GetValue()
}
// reconstruct a task from metadata stashed in a pod entry. there are limited pod states that
@@ -256,7 +279,6 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
annotation.TaskIdKey,
annotation.SlaveIdKey,
annotation.OfferIdKey,
annotation.ExecutorIdKey,
} {
v, found := pod.Annotations[k]
if !found {
@@ -271,10 +293,6 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
offerId = v
case annotation.TaskIdKey:
t.ID = v
case annotation.ExecutorIdKey:
// this is nowhere near sufficient to re-launch a task, but we really just
// want this for tracking
t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
}
}
t.Offer = offers.Expired(offerId, t.Spec.AssignedSlave, 0)
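
The annotations written by SaveRecoveryInfo are exactly what RecoverFrom reads back after a failover; with ExecutorIdKey dropped, the round trip now covers task, slave, and offer IDs only. A sketch of that round trip (the pod value is illustrative):

// before launch: stash recovery info in the pod's annotations
pod.Annotations = map[string]string{}
task.SaveRecoveryInfo(pod.Annotations)

// after a scheduler restart: rebuild a partial podtask.T from the pod
recovered, ok, err := RecoverFrom(pod)
if err != nil || !ok {
	// pod was not in a recoverable state; fall back to normal scheduling
}
_ = recovered // e.g. re-register with the task registry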

View File

@@ -35,12 +35,12 @@ const (
)
func fakePodTask(id string) (*T, error) {
return New(api.NewDefaultContext(), "", api.Pod{
return New(api.NewDefaultContext(), "", &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: id,
Namespace: api.NamespaceDefault,
},
}, &mesos.ExecutorInfo{})
})
}
func TestUnlimitedResources(t *testing.T) {

View File

@@ -52,7 +52,7 @@ func TestDefaultHostPortMatching(t *testing.T) {
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
task, err = New(api.NewDefaultContext(), "", pod)
if err != nil {
t.Fatal(err)
}
@@ -100,7 +100,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
task, err = New(api.NewDefaultContext(), "", pod)
if err != nil {
t.Fatal(err)
}
@@ -123,7 +123,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
task, err = New(api.NewDefaultContext(), "", pod)
if err != nil {
t.Fatal(err)
}
@@ -144,7 +144,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
}},
}},
}
task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
task, err = New(api.NewDefaultContext(), "", pod)
if err != nil {
t.Fatal(err)
}

View File

@@ -17,8 +17,6 @@ limitations under the License.
package podtask
import (
"strings"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
@@ -74,31 +72,11 @@ func ValidateProcurement(t *T, offer *mesos.Offer) error {
return nil
}
func setCommandArgument(ei *mesos.ExecutorInfo, flag, value string, create bool) {
argv := ei.Command.Arguments
overwrite := false
for i, arg := range argv {
if strings.HasPrefix(arg, flag+"=") {
overwrite = true
argv[i] = flag + "=" + value
break
}
}
if !overwrite && create {
ei.Command.Arguments = append(argv, flag+"="+value)
}
}
// NodeProcurement updates t.Spec in preparation for the task to be launched on the
// slave associated with the offer.
func NodeProcurement(t *T, offer *mesos.Offer) error {
t.Spec.SlaveID = offer.GetSlaveId().GetValue()
t.Spec.AssignedSlave = offer.GetHostname()
// the hostname of the executor needs to match that of the offer, otherwise
// the kubelet node status checker/updater is very unhappy
setCommandArgument(t.executor, "--hostname-override", offer.GetHostname(), true)
return nil
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package queuer implements a pod Queuer which stores and yields pods waiting
// to be scheduled.
package queuer
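
A minimal wiring sketch for this package, matching the New signature and the Queuer interface defined later in this diff (the update-channel backlog size is illustrative):

updates := make(chan queue.Entry, 100)     // backlog size is illustrative
podUpdates := queue.NewHistorical(updates) // FIFO fed by a pod reflector elsewhere
q := queuer.New(queue.NewDelayFIFO(), podUpdates)

done := make(chan struct{})
q.Run(done)      // spawns the watcher that moves unscheduled pods onto the delay queue
pod := q.Yield() // blocks until a pod is ready to be scheduled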

View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
package queuer
import (
"fmt"
@@ -29,8 +29,12 @@ import (
type Pod struct {
*api.Pod
deadline *time.Time
delay *time.Duration
notify queue.BreakChan
Delay *time.Duration
Notify queue.BreakChan
}
func NewPodWithDeadline(pod *api.Pod, deadline *time.Time) *Pod {
return &Pod{Pod: pod, deadline: deadline}
}
// implements Copyable
@@ -54,21 +58,21 @@ func (p *Pod) GetUID() string {
// implements Deadlined
func (dp *Pod) Deadline() (time.Time, bool) {
if dp.deadline != nil {
return *(dp.deadline), true
}
return time.Time{}, false
}
func (dp *Pod) GetDelay() time.Duration {
if dp.delay != nil {
return *(dp.delay)
if dp.Delay != nil {
return *(dp.Delay)
}
return 0
}
func (p *Pod) Breaker() queue.BreakChan {
return p.notify
return p.Notify
}
func (p *Pod) String() string {

View File

@@ -0,0 +1,209 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queuer
import (
"fmt"
"io"
"net/http"
"sync"
"time"
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
)
const (
enqueuePopTimeout = 200 * time.Millisecond
enqueueWaitTimeout = 1 * time.Second
yieldPopTimeout = 200 * time.Millisecond
yieldWaitTimeout = 1 * time.Second
)
type Queuer interface {
InstallDebugHandlers(mux *http.ServeMux)
UpdatesAvailable()
Dequeue(id string)
Requeue(pod *Pod)
Reoffer(pod *Pod)
Yield() *api.Pod
Run(done <-chan struct{})
}
type queuer struct {
lock sync.Mutex // shared by condition variables of this struct
updates queue.FIFO // queue of pod updates to be processed
queue *queue.DelayFIFO // queue of pods to be scheduled
deltaCond sync.Cond // pod changes are available for processing
unscheduledCond sync.Cond // there are unscheduled pods for processing
}
func New(queue *queue.DelayFIFO, updates queue.FIFO) Queuer {
q := &queuer{
queue: queue,
updates: updates,
}
q.deltaCond.L = &q.lock
q.unscheduledCond.L = &q.lock
return q
}
func (q *queuer) InstallDebugHandlers(mux *http.ServeMux) {
mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.queue.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
for _, x := range q.updates.List() {
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
break
}
}
})
}
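A short usage sketch for the endpoints registered above (address illustrative):

mux := http.NewServeMux()
q.InstallDebugHandlers(mux)
// then: curl http://127.0.0.1:10251/debug/scheduler/podqueue
go http.ListenAndServe("127.0.0.1:10251", mux)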
// signal that there are probably pod updates waiting to be processed
func (q *queuer) UpdatesAvailable() {
q.deltaCond.Broadcast()
}
// delete a pod from the to-be-scheduled queue
func (q *queuer) Dequeue(id string) {
q.queue.Delete(id)
}
// re-add a pod to the to-be-scheduled queue, will not overwrite existing pod data (that
// may have already changed).
func (q *queuer) Requeue(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
q.queue.Add(pod, queue.KeepExisting)
q.unscheduledCond.Broadcast()
}
// same as Requeue but calls podQueue.Offer instead of podQueue.Add
func (q *queuer) Reoffer(pod *Pod) {
// use KeepExisting in case the pod has already been updated (can happen if binding fails
// due to constraint violations); we don't want to overwrite a newer entry with stale data.
if q.queue.Offer(pod, queue.KeepExisting) {
q.unscheduledCond.Broadcast()
}
}
// spawns a go-routine to watch for unscheduled pods and queue them up
// for scheduling. returns immediately.
func (q *queuer) Run(done <-chan struct{}) {
go runtime.Until(func() {
log.Info("Watching for newly created pods")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here for short intervals so that scheduling
// may proceed even if there have been no recent pod changes
p := q.updates.Await(enqueuePopTimeout)
if p == nil {
signalled := runtime.After(q.deltaCond.Wait)
// we've yielded the lock
select {
case <-time.After(enqueueWaitTimeout):
q.deltaCond.Broadcast() // abort Wait()
<-signalled // wait for lock re-acquisition
log.V(4).Infoln("timed out waiting for a pod update")
case <-signalled:
// we've acquired the lock and there may be
// changes for us to process now
}
continue
}
pod := p.(*Pod)
if recoverAssignedSlave(pod.Pod) != "" {
log.V(3).Infof("dequeuing assigned pod for scheduling: %v", pod.Pod.Name)
q.Dequeue(pod.GetUID())
} else {
// use ReplaceExisting because we are always pushing the latest state
now := time.Now()
pod.deadline = &now
if q.queue.Offer(pod, queue.ReplaceExisting) {
q.unscheduledCond.Broadcast()
log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
} else {
log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
}
}
}
}, 1*time.Second, done)
}
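The loop above bounds sync.Cond.Wait with a timeout by running the Wait inside a helper goroutine. A self-contained sketch of the pattern, assuming runtime.After behaves like the after helper below (run a func, close the returned channel when it finishes):

package main

import (
	"sync"
	"time"
)

// after runs f and closes the returned channel once f returns.
func after(f func()) <-chan struct{} {
	ch := make(chan struct{})
	go func() { defer close(ch); f() }()
	return ch
}

// waitOrTimeout bounds cond.Wait by d; the caller must hold cond.L.
func waitOrTimeout(cond *sync.Cond, d time.Duration) {
	signalled := after(cond.Wait) // Wait releases the lock while blocked
	select {
	case <-time.After(d):
		cond.Broadcast() // abort the Wait
		<-signalled      // Wait has returned, holding the lock again
	case <-signalled:
		// woken normally; the lock is held again
	}
}

func main() {
	var mu sync.Mutex
	cond := sync.NewCond(&mu)
	mu.Lock()
	waitOrTimeout(cond, 100*time.Millisecond) // no one signals: times out
	mu.Unlock()
}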
// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
func (q *queuer) Yield() *api.Pod {
log.V(2).Info("attempting to yield a pod")
q.lock.Lock()
defer q.lock.Unlock()
for {
// limit blocking here to short intervals so that we don't block the
// enqueuer Run() routine for very long
kpod := q.queue.Await(yieldPopTimeout)
if kpod == nil {
signalled := runtime.After(q.unscheduledCond.Wait)
// lock is yielded at this point and we're going to wait for either
// a timeout, or a signal that there's data
select {
case <-time.After(yieldWaitTimeout):
q.unscheduledCond.Broadcast() // abort Wait()
<-signalled // wait for the go-routine, and the lock
log.V(4).Infoln("timed out waiting for a pod to yield")
case <-signalled:
// we have acquired the lock, and there
// may be a pod for us to pop now
}
continue
}
pod := kpod.(*Pod).Pod
if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
} else if !q.updates.Poll(podName, queue.POP_EVENT) {
log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
} else if recoverAssignedSlave(pod) != "" {
// should never happen if the enqueuer's Run loop is filtering properly
log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
} else {
return pod
}
}
}
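Yield is shaped to serve as the upstream plugin's NextPod callback. A hedged wiring sketch (the Config type is assumed from k8s plugin/pkg/scheduler, imported here under an illustrative alias; other fields elided):

cfg := &plugin.Config{
	NextPod: func() *api.Pod { return q.Yield() },
	// Algorithm, Binder, Error handler etc. elided
}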
// recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
// the BindingHostKey. For tasks in the registry of the scheduler, the same
// value is stored in T.Spec.AssignedSlave. Before launching, the BindingHostKey
// annotation is added and the executor will eventually persist that to the
// apiserver on binding.
func recoverAssignedSlave(pod *api.Pod) string {
return pod.Annotations[annotation.BindingHostKey]
}
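For example (hostname illustrative):

pod := &api.Pod{}
pod.Annotations = map[string]string{annotation.BindingHostKey: "slave-7"}
host := recoverAssignedSlave(pod) // "slave-7": the pod is treated as already scheduled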

View File

@@ -17,905 +17,22 @@ limitations under the License.
package scheduler
import (
"fmt"
"io"
"math"
"net/http"
"sync"
"time"
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler"
execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
offermetrics "k8s.io/kubernetes/contrib/mesos/pkg/offers/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/slave"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/kubelet/container"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/pkg/tools"
"k8s.io/kubernetes/pkg/util/sets"
)
type PluginInterface interface {
// the apiserver may have a different state for the pod than we do
// so reconcile our records, but only for this one pod
reconcileTask(*podtask.T)
}
// Scheduler abstracts everything other components of the scheduler need
// to access from each other
type Scheduler interface {
Tasks() podtask.Registry
sync.Locker // synchronize changes to tasks, i.e. lock, get task, change task, store task, unlock
// execute the Scheduling plugin, should start a go routine and return immediately
Run(<-chan struct{})
}
// KubernetesScheduler implements:
// 1: A mesos scheduler.
// 2: A kubernetes scheduler plugin.
// 3: A kubernetes pod.Registry.
type KubernetesScheduler struct {
// We use a lock here to avoid races
// between invoking the mesos callback
// and invoking the pod registry interfaces.
// In particular, changes to podtask.T objects are currently guarded by this lock.
*sync.RWMutex
PodScheduler
// Config related, write-once
schedcfg *schedcfg.Config
executor *mesos.ExecutorInfo
executorGroup uint64
client *client.Client
etcdClient tools.EtcdClient
failoverTimeout float64 // in seconds
reconcileInterval int64
nodeRegistrator node.Registrator
// Mesos context.
driver bindings.SchedulerDriver // late initialization
frameworkId *mesos.FrameworkID
masterInfo *mesos.MasterInfo
registered bool
registration chan struct{} // signal chan that closes upon first successful registration
onRegistration sync.Once
offers offers.Registry
slaveHostNames *slave.Registry
// unsafe state, needs to be guarded
taskRegistry podtask.Registry
// via deferred init
plugin PluginInterface
reconciler *Reconciler
reconcileCooldown time.Duration
asRegisteredMaster proc.Doer
terminate <-chan struct{} // signal chan, closes when we should kill background tasks
}
type Config struct {
Schedcfg schedcfg.Config
Executor *mesos.ExecutorInfo
Scheduler PodScheduler
Client *client.Client
EtcdClient tools.EtcdClient
FailoverTimeout float64
ReconcileInterval int64
ReconcileCooldown time.Duration
LookupNode node.LookupFunc
}
// New creates a new KubernetesScheduler
func New(config Config) *KubernetesScheduler {
var k *KubernetesScheduler
k = &KubernetesScheduler{
schedcfg: &config.Schedcfg,
RWMutex: new(sync.RWMutex),
executor: config.Executor,
executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
PodScheduler: config.Scheduler,
client: config.Client,
etcdClient: config.EtcdClient,
failoverTimeout: config.FailoverTimeout,
reconcileInterval: config.ReconcileInterval,
nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode),
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
// the node must be registered and have up-to-date labels
n := config.LookupNode(o.GetHostname())
if n == nil || !node.IsUpToDate(n, node.SlaveAttributesToLabels(o.GetAttributes())) {
return false
}
// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours
for _, eid := range o.GetExecutorIds() {
execuid := uid.Parse(eid.GetValue())
if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
return false
}
}
return true
},
DeclineOffer: func(id string) <-chan error {
errOnce := proc.NewErrorOnce(k.terminate)
errOuter := k.asRegisteredMaster.Do(func() {
var err error
defer errOnce.Report(err)
offerId := mutil.NewOfferID(id)
filters := &mesos.Filters{}
_, err = k.driver.DeclineOffer(offerId, filters)
})
return errOnce.Send(errOuter).Err()
},
// remember expired offers so that we can tell if a previously scheduled task relies on one
LingerTTL: config.Schedcfg.OfferLingerTTL.Duration,
TTL: config.Schedcfg.OfferTTL.Duration,
ListenerDelay: config.Schedcfg.ListenerDelay.Duration,
}),
slaveHostNames: slave.NewRegistry(),
taskRegistry: podtask.NewInMemoryRegistry(),
reconcileCooldown: config.ReconcileCooldown,
registration: make(chan struct{}),
asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
return proc.ErrorChanf("cannot execute action with unregistered scheduler")
}),
}
return k
}
func (k *KubernetesScheduler) Init(electedMaster proc.Process, pl PluginInterface, mux *http.ServeMux) error {
log.V(1).Infoln("initializing kubernetes mesos scheduler")
k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
if !k.registered {
return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
}
return electedMaster.Do(a)
})
k.terminate = electedMaster.Done()
k.plugin = pl
k.offers.Init(k.terminate)
k.InstallDebugHandlers(mux)
k.nodeRegistrator.Run(k.terminate)
return k.recoverTasks()
}
func (k *KubernetesScheduler) asMaster() proc.Doer {
k.RLock()
defer k.RUnlock()
return k.asRegisteredMaster
}
func (k *KubernetesScheduler) InstallDebugHandlers(mux *http.ServeMux) {
wrappedHandler := func(uri string, h http.Handler) {
mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
ch := make(chan struct{})
closer := runtime.Closer(ch)
proc.OnError(k.asMaster().Do(func() {
defer closer()
h.ServeHTTP(w, r)
}), func(err error) {
defer closer()
log.Warningf("failed HTTP request for %s: %v", uri, err)
w.WriteHeader(http.StatusServiceUnavailable)
}, k.terminate)
select {
case <-time.After(k.schedcfg.HttpHandlerTimeout.Duration):
log.Warningf("timed out waiting for request to be processed")
w.WriteHeader(http.StatusServiceUnavailable)
return
case <-ch: // noop
}
})
}
requestReconciliation := func(uri string, requestAction func()) {
wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestAction()
w.WriteHeader(http.StatusNoContent)
}))
}
requestReconciliation("/debug/actions/requestExplicit", k.reconciler.RequestExplicit)
requestReconciliation("/debug/actions/requestImplicit", k.reconciler.RequestImplicit)
wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
slaves := k.slaveHostNames.SlaveIDs()
for _, slaveId := range slaves {
_, err := k.driver.SendFrameworkMessage(
k.executor.ExecutorId,
mutil.NewSlaveID(slaveId),
messages.Kamikaze)
if err != nil {
log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
} else {
io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
}
}
io.WriteString(w, "OK")
}))
}
func (k *KubernetesScheduler) Registration() <-chan struct{} {
return k.registration
}
// Registered is called when the scheduler registered with the master successfully.
func (k *KubernetesScheduler) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)
k.driver = drv
k.frameworkId = fid
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.reconciler.RequestExplicit()
}
func (k *KubernetesScheduler) storeFrameworkId() {
// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
_, err := k.etcdClient.Set(meta.FrameworkIDKey, k.frameworkId.GetValue(), uint64(k.failoverTimeout))
if err != nil {
log.Errorf("failed to renew frameworkId TTL: %v", err)
}
}
// Reregistered is called when the scheduler re-registered with the master successfully.
// This happens when the master fails over.
func (k *KubernetesScheduler) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
log.Infof("Scheduler reregistered with the master: %v\n", mi)
k.driver = drv
k.masterInfo = mi
k.registered = true
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
k.reconciler.RequestExplicit()
}
// perform one-time initialization actions upon the first registration event received from Mesos.
func (k *KubernetesScheduler) onInitialRegistration(driver bindings.SchedulerDriver) {
defer close(k.registration)
if k.failoverTimeout > 0 {
refreshInterval := k.schedcfg.FrameworkIdRefreshInterval.Duration
if k.failoverTimeout < k.schedcfg.FrameworkIdRefreshInterval.Duration.Seconds() {
refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
}
go runtime.Until(k.storeFrameworkId, refreshInterval, k.terminate)
}
r1 := k.makeTaskRegistryReconciler()
r2 := k.makePodRegistryReconciler()
k.reconciler = newReconciler(k.asRegisteredMaster, k.makeCompositeReconciler(r1, r2),
k.reconcileCooldown, k.schedcfg.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
go k.reconciler.Run(driver)
if k.reconcileInterval > 0 {
ri := time.Duration(k.reconcileInterval) * time.Second
time.AfterFunc(k.schedcfg.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.reconciler.RequestImplicit, ri, k.terminate) })
log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedcfg.InitialImplicitReconciliationDelay.Duration)
}
}
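The refresh-interval rule above, isolated for clarity: use the configured interval unless the failover timeout is shorter, in which case refresh at half the timeout but at least once per second. A minimal sketch, not the scheduler's actual helper:

func frameworkIdRefreshInterval(failoverTimeout float64, configured time.Duration) time.Duration {
	if failoverTimeout < configured.Seconds() {
		// half the failover timeout, floored at one second
		return time.Duration(math.Max(1, failoverTimeout/2)) * time.Second
	}
	return configured
}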
// Disconnected is called when the scheduler loses connection to the master.
func (k *KubernetesScheduler) Disconnected(driver bindings.SchedulerDriver) {
log.Infof("Master disconnected!\n")
k.registered = false
// discard all cached offers to avoid unnecessary TASK_LOST updates
k.offers.Invalidate("")
}
// ResourceOffers is called when the scheduler receives some offers from the master.
func (k *KubernetesScheduler) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
log.V(2).Infof("Received offers %+v", offers)
// Record the offers in the global offer map as well as each slave's offer map.
k.offers.Add(offers)
for _, offer := range offers {
slaveId := offer.GetSlaveId().GetValue()
k.slaveHostNames.Register(slaveId, offer.GetHostname())
// create the api object if it does not exist already
if k.nodeRegistrator != nil {
labels := node.SlaveAttributesToLabels(offer.GetAttributes())
_, err := k.nodeRegistrator.Register(offer.GetHostname(), labels)
if err != nil {
log.Error(err)
}
}
}
}
// OfferRescinded is called when the resources are rescinded from the scheduler.
func (k *KubernetesScheduler) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
log.Infof("Offer rescinded %v\n", offerId)
oid := offerId.GetValue()
k.offers.Delete(oid, offermetrics.OfferRescinded)
}
// StatusUpdate is called when a status update message is sent to the scheduler.
func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
source, reason := "none", "none"
if taskStatus.Source != nil {
source = (*taskStatus.Source).String()
}
if taskStatus.Reason != nil {
reason = (*taskStatus.Reason).String()
}
taskState := taskStatus.GetState()
metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()
message := "none"
if taskStatus.Message != nil {
message = *taskStatus.Message
}
log.Infof(
"task status update %q from %q for task %q on slave %q executor %q for reason %q with message %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason,
message,
)
switch taskState {
case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
if _, state := k.taskRegistry.UpdateStatus(taskStatus); state == podtask.StateUnknown {
if taskState != mesos.TaskState_TASK_FINISHED {
//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
//I don't want to reincarnate then.. TASK_LOST is a special case because
//the master is stateless and there are scenarios where I may get TASK_LOST
//followed by TASK_RUNNING.
//TODO(jdef) consider running this asynchronously since there are API server
//calls that may be made
k.reconcileNonTerminalTask(driver, taskStatus)
} // else, we don't really care about FINISHED tasks that aren't registered
return
}
if hostName := k.slaveHostNames.HostName(taskStatus.GetSlaveId().GetValue()); hostName == "" {
// a registered task has an update reported by a slave that we don't recognize.
// this should never happen! So we don't reconcile it.
log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
return
}
case mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
go k.plugin.reconcileTask(task)
return
}
} else {
// unknown task failed, not much we can do about it
return
}
// last-ditch effort to reconcile our records
fallthrough
case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
k.reconcileTerminalTask(driver, taskStatus)
default:
log.Errorf(
"unknown task status %q from %q for task %q on slave %q executor %q for reason %q with message %q",
taskState.String(),
source,
taskStatus.TaskId.GetValue(),
taskStatus.SlaveId.GetValue(),
taskStatus.ExecutorId.GetValue(),
reason,
message,
)
}
}
func (k *KubernetesScheduler) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
task, state := k.taskRegistry.UpdateStatus(taskStatus)
if (state == podtask.StateRunning || state == podtask.StatePending) &&
((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared)) {
//--
// pod-task has metadata that refers to:
// (1) a task that Mesos no longer knows about, or else
// (2) a pod that the Kubelet will never report as "failed"
// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
// For now, destroy the pod and hope that there's a replication controller backing it up.
// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
pod := &task.Pod
log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
if err := k.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
}
} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
// attempt to prevent dangling pods in the pod and task registries
log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
k.reconciler.RequestExplicit()
} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
//If we're reconciling and receive this then the executor may be
//running a task that we need it to kill. It's possible that the framework
//is unrecognized by the master at this point, so KillTask is not guaranteed
//to do anything. The underlying driver transport may be able to send a
//FrameworkMessage directly to the slave to terminate the task.
log.V(2).Infof("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
log.Error(err.Error())
}
}
}
// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
// attempt to recover task from pod info:
// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
// - pull the pod metadata down from the api server
// - perform task recovery based on pod metadata
taskId := taskStatus.TaskId.GetValue()
if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
// there will be no data in the task status that we can use to determine the associated pod
switch taskStatus.GetState() {
case mesos.TaskState_TASK_STAGING:
// there is still hope for this task, don't kill it just yet
//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
return
default:
// for TASK_{STARTING,RUNNING} we should have already attempted recovery via recoverTasks().
// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
// be processing this reconciliation update before we process the one from the executor.
// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
// so it gets killed.
log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
}
} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
// possible rogue pod exists at this point because we can't identify it; should kill the task
log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
podStatus.Name, taskId, err)
} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
if t, ok, err := podtask.RecoverFrom(*pod); ok {
log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
_, err := k.taskRegistry.Register(t)
if err != nil {
// someone beat us to it?!
log.Warningf("failed to register recovered task: %v", err)
return
} else {
k.taskRegistry.UpdateStatus(taskStatus)
}
return
} else if err != nil {
//should kill the pod and the task
log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
}
} else {
//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
//metadata is not appropriate for task reconstruction -- which should almost certainly never
//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
//we were failed over.
//kill this task, allow the newly launched scheduler to schedule the new pod
log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
}
} else if errors.IsNotFound(err) {
// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
} else if errors.IsServerTimeout(err) {
log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
return
} else {
log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
return
}
if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
log.Errorf("failed to kill task %v: %v", taskId, err)
}
}
// FrameworkMessage is called when the scheduler receives a message from the executor.
func (k *KubernetesScheduler) FrameworkMessage(driver bindings.SchedulerDriver,
executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) {
log.Infof("Received message from executor %v of slave %v: %v\n", executorId, slaveId, message)
}
// SlaveLost is called when some slave is lost.
func (k *KubernetesScheduler) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) {
log.Infof("Slave %v is lost\n", slaveId)
sid := slaveId.GetValue()
k.offers.InvalidateForSlave(sid)
// TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile
// tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically
// flush lost slaves older than X that no tasks or pods reference.
// unfinished tasks/pods will be dropped. use a replication controller if you want pods to
// be restarted when slaves die.
}
// ExecutorLost is called when some executor is lost.
func (k *KubernetesScheduler) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) {
log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status)
// TODO(yifan): Restart any unfinished tasks of the executor.
}
// Error is called when there is an unrecoverable error in the scheduler or scheduler driver.
// The driver should have been aborted before this is invoked.
func (k *KubernetesScheduler) Error(driver bindings.SchedulerDriver, message string) {
log.Fatalf("fatal scheduler error: %v\n", message)
}
// filter func used for explicit task reconciliation, selects only non-terminal tasks which
// have been communicated to mesos (read: launched).
func explicitTaskFilter(t *podtask.T) bool {
switch t.State {
case podtask.StateRunning:
return true
case podtask.StatePending:
return t.Has(podtask.Launched)
default:
return false
}
}
// invoke the given ReconcilerAction funcs in sequence, aborting the sequence if reconciliation
// is cancelled. if any other errors occur the composite reconciler will attempt to complete the
// sequence, reporting only the last generated error.
func (k *KubernetesScheduler) makeCompositeReconciler(actions ...ReconcilerAction) ReconcilerAction {
if x := len(actions); x == 0 {
// programming error
panic("no actions specified for composite reconciler")
} else if x == 1 {
return actions[0]
}
chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b ReconcilerAction) <-chan error {
ech := a(d, c)
ch := make(chan error, 1)
go func() {
select {
case <-k.terminate:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
ech = b(d, c)
select {
case <-k.terminate:
case <-c:
case e := <-ech:
if e != nil {
ch <- e
return
}
close(ch)
return
}
}
ch <- fmt.Errorf("aborting composite reconciler action")
}()
return ch
}
result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, actions[0], actions[1])
}
for i := 2; i < len(actions); i++ {
i := i
next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
return chained(d, c, ReconcilerAction(result), actions[i])
}
result = next
}
return ReconcilerAction(result)
}
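A synchronous simplification of the composite semantics described in the comment above (abort on cancellation, otherwise attempt the whole sequence and report only the last error); the real version is asynchronous and channel-based:

package reconcile

import "errors"

type action func(cancel <-chan struct{}) error

func composite(actions ...action) action {
	if len(actions) == 0 {
		panic("no actions specified for composite") // programming error
	}
	return func(cancel <-chan struct{}) error {
		var last error
		for _, a := range actions {
			select {
			case <-cancel:
				return errors.New("composite action cancelled")
			default:
			}
			if err := a(cancel); err != nil {
				last = err // remember only the last error
			}
		}
		return last
	}
}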
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks listed in the scheduler's internal taskRegistry.
func (k *KubernetesScheduler) makeTaskRegistryReconciler() ReconcilerAction {
return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
taskToSlave := make(map[string]string)
for _, t := range k.taskRegistry.List(explicitTaskFilter) {
if t.Spec.SlaveID != "" {
taskToSlave[t.ID] = t.Spec.SlaveID
}
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
// reconciler action factory, performs explicit task reconciliation for non-terminal
// tasks identified by annotations in the Kubernetes pod registry.
func (k *KubernetesScheduler) makePodRegistryReconciler() ReconcilerAction {
return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
podList, err := k.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
if err != nil {
return proc.ErrorChanf("failed to reconcile pod registry: %v", err)
}
taskToSlave := make(map[string]string)
for _, pod := range podList.Items {
if len(pod.Annotations) == 0 {
continue
}
taskId, found := pod.Annotations[meta.TaskIdKey]
if !found {
continue
}
slaveId, found := pod.Annotations[meta.SlaveIdKey]
if !found {
continue
}
taskToSlave[taskId] = slaveId
}
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
})
}
// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
func (k *KubernetesScheduler) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
log.Info("explicit reconcile tasks")
// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
statusList := []*mesos.TaskStatus{}
remaining := sets.StringKeySet(taskToSlave)
for taskId, slaveId := range taskToSlave {
if slaveId == "" {
delete(taskToSlave, taskId)
continue
}
statusList = append(statusList, &mesos.TaskStatus{
TaskId: mutil.NewTaskID(taskId),
SlaveId: mutil.NewSlaveID(slaveId),
State: mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
})
}
select {
case <-cancel:
return reconciliationCancelledErr
default:
if _, err := driver.ReconcileTasks(statusList); err != nil {
return err
}
}
start := time.Now()
first := true
for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
first = false
// nothing to do here other than wait for status updates..
if backoff > k.schedcfg.ExplicitReconciliationMaxBackoff.Duration {
backoff = k.schedcfg.ExplicitReconciliationMaxBackoff.Duration
}
select {
case <-cancel:
return reconciliationCancelledErr
case <-time.After(backoff):
for taskId := range remaining {
if task, _ := k.taskRegistry.Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
// keep this task in remaining list
continue
}
remaining.Delete(taskId)
}
}
}
return nil
}
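The wait loop above is a capped exponential backoff that runs at least once. Stripped of the Mesos driver it reduces to roughly the following (remaining and pruneReconciled are illustrative stand-ins for the registry checks):

// backoff doubles per iteration, clamped to maxBackoff
for backoff, first := 1*time.Second, true; first || remaining() > 0; backoff *= 2 {
	first = false
	if backoff > maxBackoff {
		backoff = maxBackoff
	}
	time.Sleep(backoff) // the real loop selects on cancel / time.After instead
	pruneReconciled()   // drop tasks whose fresh status arrived since start
}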
var (
reconciliationCancelledErr = fmt.Errorf("explicit task reconciliation cancelled")
)
type ReconcilerAction func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error
type Reconciler struct {
proc.Doer
Action ReconcilerAction
explicit chan struct{} // send an empty struct to trigger explicit reconciliation
implicit chan struct{} // send an empty struct to trigger implicit reconciliation
done <-chan struct{} // close this when you want the reconciler to exit
cooldown time.Duration
explicitReconciliationAbortTimeout time.Duration
}
func newReconciler(doer proc.Doer, action ReconcilerAction,
cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) *Reconciler {
return &Reconciler{
Doer: doer,
explicit: make(chan struct{}, 1),
implicit: make(chan struct{}, 1),
cooldown: cooldown,
explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
done: done,
Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
// trigger the reconciler action in the doer's execution context,
// but it could take a while and the scheduler needs to be able to
// process updates, the callbacks for which ALSO execute in the SAME
// deferred execution context -- so the action MUST be executed async.
errOnce := proc.NewErrorOnce(cancel)
return errOnce.Send(doer.Do(func() {
// only triggers the action if we're the currently elected,
// registered master and runs the action async.
go func() {
var err <-chan error
defer errOnce.Send(err)
err = action(driver, cancel)
}()
})).Err()
},
}
}
func (r *Reconciler) RequestExplicit() {
select {
case r.explicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
func (r *Reconciler) RequestImplicit() {
select {
case r.implicit <- struct{}{}: // noop
default: // request queue full; noop
}
}
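Both request channels are buffered with capacity 1, so requests made while one is already pending coalesce into a single wakeup. A self-contained sketch of the pattern:

requests := make(chan struct{}, 1)
request := func() {
	select {
	case requests <- struct{}{}: // queued a wakeup
	default: // one already pending; drop this request
	}
}
request()
request() // coalesced: the consumer sees a single pending request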
// execute task reconciliation, returns when r.done is closed. intended to run as a goroutine.
// if reconciliation is requested while another is in progress, the in-progress operation will be
// cancelled before the new reconciliation operation begins.
func (r *Reconciler) Run(driver bindings.SchedulerDriver) {
var cancel, finished chan struct{}
requestLoop:
for {
select {
case <-r.done:
return
default: // proceed
}
select {
case <-r.implicit:
metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
select {
case <-r.done:
return
case <-r.explicit:
break // give preference to a pending request for explicit
default: // continue
// don't run implicit reconciliation while explicit is ongoing
if finished != nil {
select {
case <-finished: // continue w/ implicit
default:
log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
continue requestLoop
}
}
errOnce := proc.NewErrorOnce(r.done)
errCh := r.Do(func() {
var err error
defer errOnce.Report(err)
log.Infoln("implicit reconcile tasks")
metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
}
})
proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
log.Errorf("failed to run implicit reconciliation: %v", err)
}, r.done)
goto slowdown
}
case <-r.done:
return
case <-r.explicit: // continue
metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
}
if cancel != nil {
close(cancel)
cancel = nil
// play nice and wait for the prior operation to finish, complain
// if it doesn't
select {
case <-r.done:
return
case <-finished: // noop, expected
case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
log.Error("reconciler action failed to stop upon cancellation")
}
}
// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
// if cancellation takes too long or fails - we don't want to close the same chan
// more than once
cancel = make(chan struct{})
finished = make(chan struct{})
go func(fin chan struct{}) {
startedAt := time.Now()
defer func() {
metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
}()
metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
defer close(fin)
err := <-r.Action(driver, cancel)
if err == reconciliationCancelledErr {
metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
log.Infoln(err.Error())
} else if err != nil {
log.Errorf("reconciler action failed: %v", err)
}
}(finished)
slowdown:
// don't allow reconciliation to run very frequently, either explicit or implicit
select {
case <-r.done:
return
case <-time.After(r.cooldown): // noop
}
} // for
}
func (ks *KubernetesScheduler) recoverTasks() error {
podList, err := ks.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
if err != nil {
log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err)
return err
}
recoverSlave := func(t *podtask.T) {
slaveId := t.Spec.SlaveID
ks.slaveHostNames.Register(slaveId, t.Offer.Host())
}
for _, pod := range podList.Items {
if _, isMirrorPod := pod.Annotations[kubetypes.ConfigMirrorAnnotationKey]; isMirrorPod {
// mirrored pods are never reconciled because the scheduler isn't responsible for
// scheduling them; they're started by the executor/kubelet upon instantiation and
// reflected in the apiserver afterward. the scheduler has no knowledge of them.
continue
}
if t, ok, err := podtask.RecoverFrom(pod); err != nil {
log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err)
err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil)
//TODO(jdef) check for temporary or not-found errors
if err != nil {
log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err)
}
} else if ok {
ks.taskRegistry.Register(t)
recoverSlave(t)
log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name)
}
}
return nil
Offers() offers.Registry
Reconcile(t *podtask.T)
KillTask(id string) error
LaunchTask(t *podtask.T) error
Run(done <-chan struct{})
}

View File

@@ -0,0 +1,74 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"sync"
"github.com/stretchr/testify/mock"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"time"
)
// MockScheduler implements the Scheduler interface
type MockScheduler struct {
sync.RWMutex
mock.Mock
}
func (m *MockScheduler) Run(done <-chan struct{}) {
_ = m.Called()
runtime.Until(func() {
time.Sleep(time.Second)
}, time.Second, done)
return
}
func (m *MockScheduler) Offers() (f offers.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(offers.Registry)
}
return
}
func (m *MockScheduler) Tasks() (f podtask.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(podtask.Registry)
}
return
}
func (m *MockScheduler) KillTask(taskId string) error {
args := m.Called(taskId)
return args.Error(0)
}
func (m *MockScheduler) LaunchTask(task *podtask.T) error {
args := m.Called(task)
return args.Error(0)
}
func (m *MockScheduler) Reconcile(task *podtask.T) {
_ = m.Called()
return
}
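A short test sketch exercising the mock via testify's expectation API (On, Return and AssertExpectations come from github.com/stretchr/testify/mock; the test body is illustrative):

func TestKillTask(t *testing.T) {
	m := &MockScheduler{}
	m.On("KillTask", "task-1").Return(nil)
	if err := m.KillTask("task-1"); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	m.AssertExpectations(t)
}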

View File

@@ -42,7 +42,7 @@ func (m *SchedulerServer) newServiceWriter(stop <-chan struct{}) func() {
glog.Errorf("Can't create scheduler service: %v", err)
}
if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.Address), m.Port); err != nil {
if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.address), m.port); err != nil {
glog.Errorf("Can't create scheduler endpoints: %v", err)
}
@@ -76,8 +76,8 @@ func (m *SchedulerServer) createSchedulerServiceIfNeeded(serviceName string, ser
SessionAffinity: api.ServiceAffinityNone,
},
}
if m.ServiceAddress != nil {
svc.Spec.ClusterIP = m.ServiceAddress.String()
if m.serviceAddress != nil {
svc.Spec.ClusterIP = m.serviceAddress.String()
}
_, err := m.client.Services(api.NamespaceValue(ctx)).Create(svc)
if err != nil && errors.IsAlreadyExists(err) {

View File

@@ -54,7 +54,9 @@ import (
minioncfg "k8s.io/kubernetes/contrib/mesos/pkg/minion/config"
"k8s.io/kubernetes/contrib/mesos/pkg/profile"
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
@@ -65,6 +67,7 @@ import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
"k8s.io/kubernetes/pkg/client/cache"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
clientauth "k8s.io/kubernetes/pkg/client/unversioned/auth"
"k8s.io/kubernetes/pkg/fields"
@@ -86,72 +89,72 @@ const (
)
type SchedulerServer struct {
Port int
Address net.IP
EnableProfiling bool
AuthPath string
APIServerList []string
EtcdServerList []string
EtcdConfigFile string
AllowPrivileged bool
ExecutorPath string
ProxyPath string
MesosMaster string
MesosUser string
MesosRole string
MesosAuthPrincipal string
MesosAuthSecretFile string
MesosCgroupPrefix string
MesosExecutorCPUs mresource.CPUShares
MesosExecutorMem mresource.MegaBytes
Checkpoint bool
FailoverTimeout float64
port int
address net.IP
enableProfiling bool
authPath string
apiServerList []string
etcdServerList []string
etcdConfigFile string
allowPrivileged bool
executorPath string
proxyPath string
mesosMaster string
mesosUser string
mesosRole string
mesosAuthPrincipal string
mesosAuthSecretFile string
mesosCgroupPrefix string
mesosExecutorCPUs mresource.CPUShares
mesosExecutorMem mresource.MegaBytes
checkpoint bool
failoverTimeout float64
ExecutorLogV int
ExecutorBindall bool
ExecutorSuicideTimeout time.Duration
LaunchGracePeriod time.Duration
executorLogV int
executorBindall bool
executorSuicideTimeout time.Duration
launchGracePeriod time.Duration
RunProxy bool
ProxyBindall bool
ProxyLogV int
runProxy bool
proxyBindall bool
proxyLogV int
MinionPathOverride string
MinionLogMaxSize resource.Quantity
MinionLogMaxBackups int
MinionLogMaxAgeInDays int
minionPathOverride string
minionLogMaxSize resource.Quantity
minionLogMaxBackups int
minionLogMaxAgeInDays int
MesosAuthProvider string
DriverPort uint
HostnameOverride string
ReconcileInterval int64
ReconcileCooldown time.Duration
DefaultContainerCPULimit mresource.CPUShares
DefaultContainerMemLimit mresource.MegaBytes
SchedulerConfigFileName string
Graceful bool
FrameworkName string
FrameworkWebURI string
HA bool
AdvertisedAddress string
ServiceAddress net.IP
HADomain string
KMPath string
ClusterDNS net.IP
ClusterDomain string
KubeletRootDirectory string
KubeletDockerEndpoint string
KubeletPodInfraContainerImage string
KubeletCadvisorPort uint
KubeletHostNetworkSources string
KubeletSyncFrequency time.Duration
KubeletNetworkPluginName string
StaticPodsConfigPath string
DockerCfgPath string
ContainPodResources bool
AccountForPodResources bool
mesosAuthProvider string
driverPort uint
hostnameOverride string
reconcileInterval int64
reconcileCooldown time.Duration
defaultContainerCPULimit mresource.CPUShares
defaultContainerMemLimit mresource.MegaBytes
schedulerConfigFileName string
graceful bool
frameworkName string
frameworkWebURI string
ha bool
advertisedAddress string
serviceAddress net.IP
haDomain string
kmPath string
clusterDNS net.IP
clusterDomain string
kubeletRootDirectory string
kubeletDockerEndpoint string
kubeletPodInfraContainerImage string
kubeletCadvisorPort uint
kubeletHostNetworkSources string
kubeletSyncFrequency time.Duration
kubeletNetworkPluginName string
staticPodsConfigPath string
dockerCfgPath string
containPodResources bool
accountForPodResources bool
nodeRelistPeriod time.Duration
SandboxOverlay string
sandboxOverlay string
executable string // path to the binary running this service
client *client.Client
@@ -170,36 +173,36 @@ type schedulerProcessInterface interface {
// NewSchedulerServer creates a new SchedulerServer with default parameters
func NewSchedulerServer() *SchedulerServer {
s := SchedulerServer{
Port: ports.SchedulerPort,
Address: net.ParseIP("127.0.0.1"),
FailoverTimeout: time.Duration((1 << 62) - 1).Seconds(),
port: ports.SchedulerPort,
address: net.ParseIP("127.0.0.1"),
failoverTimeout: time.Duration((1 << 62) - 1).Seconds(),
RunProxy: true,
ExecutorSuicideTimeout: execcfg.DefaultSuicideTimeout,
LaunchGracePeriod: execcfg.DefaultLaunchGracePeriod,
DefaultContainerCPULimit: mresource.DefaultDefaultContainerCPULimit,
DefaultContainerMemLimit: mresource.DefaultDefaultContainerMemLimit,
runProxy: true,
executorSuicideTimeout: execcfg.DefaultSuicideTimeout,
launchGracePeriod: execcfg.DefaultLaunchGracePeriod,
defaultContainerCPULimit: mresource.DefaultDefaultContainerCPULimit,
defaultContainerMemLimit: mresource.DefaultDefaultContainerMemLimit,
MinionLogMaxSize: minioncfg.DefaultLogMaxSize(),
MinionLogMaxBackups: minioncfg.DefaultLogMaxBackups,
MinionLogMaxAgeInDays: minioncfg.DefaultLogMaxAgeInDays,
minionLogMaxSize: minioncfg.DefaultLogMaxSize(),
minionLogMaxBackups: minioncfg.DefaultLogMaxBackups,
minionLogMaxAgeInDays: minioncfg.DefaultLogMaxAgeInDays,
MesosAuthProvider: sasl.ProviderName,
MesosCgroupPrefix: minioncfg.DefaultCgroupPrefix,
MesosMaster: defaultMesosMaster,
MesosUser: defaultMesosUser,
MesosExecutorCPUs: defaultExecutorCPUs,
MesosExecutorMem: defaultExecutorMem,
ReconcileInterval: defaultReconcileInterval,
ReconcileCooldown: defaultReconcileCooldown,
Checkpoint: true,
FrameworkName: defaultFrameworkName,
HA: false,
mesosAuthProvider: sasl.ProviderName,
mesosCgroupPrefix: minioncfg.DefaultCgroupPrefix,
mesosMaster: defaultMesosMaster,
mesosUser: defaultMesosUser,
mesosExecutorCPUs: defaultExecutorCPUs,
mesosExecutorMem: defaultExecutorMem,
reconcileInterval: defaultReconcileInterval,
reconcileCooldown: defaultReconcileCooldown,
checkpoint: true,
frameworkName: defaultFrameworkName,
ha: false,
mux: http.NewServeMux(),
KubeletCadvisorPort: 4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
KubeletSyncFrequency: 10 * time.Second,
ContainPodResources: true,
AccountForPodResources: true,
kubeletCadvisorPort: 4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
kubeletSyncFrequency: 10 * time.Second,
containPodResources: true,
accountForPodResources: true,
nodeRelistPeriod: defaultNodeRelistPeriod,
}
// cache this for later use. also useful in case the original binary gets deleted, e.g.
@@ -208,76 +211,76 @@ func NewSchedulerServer() *SchedulerServer {
log.Fatalf("failed to determine path to currently running executable: %v", err)
} else {
s.executable = filename
s.KMPath = filename
s.kmPath = filename
}
return &s
}
func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
fs.IntVar(&s.Port, "port", s.Port, "The port that the scheduler's http service runs on")
fs.IPVar(&s.Address, "address", s.Address, "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
fs.BoolVar(&s.EnableProfiling, "profiling", s.EnableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
fs.StringSliceVar(&s.APIServerList, "api-servers", s.APIServerList, "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
fs.StringVar(&s.AuthPath, "auth-path", s.AuthPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
fs.StringSliceVar(&s.EtcdServerList, "etcd-servers", s.EtcdServerList, "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
fs.StringVar(&s.EtcdConfigFile, "etcd-config", s.EtcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
fs.BoolVar(&s.AllowPrivileged, "allow-privileged", s.AllowPrivileged, "If true, allow privileged containers.")
fs.StringVar(&s.ClusterDomain, "cluster-domain", s.ClusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
fs.IPVar(&s.ClusterDNS, "cluster-dns", s.ClusterDNS, "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")
fs.StringVar(&s.StaticPodsConfigPath, "static-pods-config", s.StaticPodsConfigPath, "Path for specification of static pods. Path should point to dir containing the staticPods configuration files. Defaults to none.")
fs.IntVar(&s.port, "port", s.port, "The port that the scheduler's http service runs on")
fs.IPVar(&s.address, "address", s.address, "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
fs.BoolVar(&s.enableProfiling, "profiling", s.enableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
fs.StringSliceVar(&s.apiServerList, "api-servers", s.apiServerList, "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
fs.StringVar(&s.authPath, "auth-path", s.authPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
fs.StringSliceVar(&s.etcdServerList, "etcd-servers", s.etcdServerList, "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
fs.StringVar(&s.etcdConfigFile, "etcd-config", s.etcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
fs.BoolVar(&s.allowPrivileged, "allow-privileged", s.allowPrivileged, "If true, allow privileged containers.")
fs.StringVar(&s.clusterDomain, "cluster-domain", s.clusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
fs.IPVar(&s.clusterDNS, "cluster-dns", s.clusterDNS, "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")
fs.StringVar(&s.staticPodsConfigPath, "static-pods-config", s.staticPodsConfigPath, "Path for specification of static pods. Path should point to dir containing the staticPods configuration files. Defaults to none.")
fs.StringVar(&s.MesosMaster, "mesos-master", s.MesosMaster, "Location of the Mesos master. The format is a comma-delimited list of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
fs.StringVar(&s.MesosUser, "mesos-user", s.MesosUser, "Mesos user for this framework, defaults to root.")
fs.StringVar(&s.MesosRole, "mesos-role", s.MesosRole, "Mesos role for this framework, defaults to none.")
fs.StringVar(&s.MesosAuthPrincipal, "mesos-authentication-principal", s.MesosAuthPrincipal, "Mesos authentication principal.")
fs.StringVar(&s.MesosAuthSecretFile, "mesos-authentication-secret-file", s.MesosAuthSecretFile, "Mesos authentication secret file.")
fs.StringVar(&s.MesosAuthProvider, "mesos-authentication-provider", s.MesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
fs.StringVar(&s.DockerCfgPath, "dockercfg-path", s.DockerCfgPath, "Path to a dockercfg file that will be used by the docker instance of the minions.")
fs.StringVar(&s.MesosCgroupPrefix, "mesos-cgroup-prefix", s.MesosCgroupPrefix, "The cgroup prefix concatenated with MESOS_DIRECTORY must give the executor cgroup set by Mesos")
fs.Var(&s.MesosExecutorCPUs, "mesos-executor-cpus", "Initial CPU shares to allocate for each Mesos executor container.")
fs.Var(&s.MesosExecutorMem, "mesos-executor-mem", "Initial memory (MB) to allocate for each Mesos executor container.")
fs.BoolVar(&s.Checkpoint, "checkpoint", s.Checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
fs.Float64Var(&s.FailoverTimeout, "failover-timeout", s.FailoverTimeout, fmt.Sprintf("Framework failover timeout, in sec."))
fs.UintVar(&s.DriverPort, "driver-port", s.DriverPort, "Port that the Mesos scheduler driver process should listen on.")
fs.StringVar(&s.HostnameOverride, "hostname-override", s.HostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
fs.Int64Var(&s.ReconcileInterval, "reconcile-interval", s.ReconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
fs.DurationVar(&s.ReconcileCooldown, "reconcile-cooldown", s.ReconcileCooldown, "Minimum rest period between task reconciliation operations.")
fs.StringVar(&s.SchedulerConfigFileName, "scheduler-config", s.SchedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
fs.BoolVar(&s.Graceful, "graceful", s.Graceful, "Indicator of a graceful failover, intended for internal use only.")
fs.BoolVar(&s.HA, "ha", s.HA, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
fs.StringVar(&s.FrameworkName, "framework-name", s.FrameworkName, "The framework name to register with Mesos.")
fs.StringVar(&s.FrameworkWebURI, "framework-weburi", s.FrameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
fs.StringVar(&s.AdvertisedAddress, "advertised-address", s.AdvertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
fs.IPVar(&s.ServiceAddress, "service-address", s.ServiceAddress, "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
fs.Var(&s.DefaultContainerCPULimit, "default-container-cpu-limit", "Containers without a CPU resource limit are admitted this much CPU shares")
fs.Var(&s.DefaultContainerMemLimit, "default-container-mem-limit", "Containers without a memory resource limit are admitted this much amount of memory in MB")
fs.BoolVar(&s.ContainPodResources, "contain-pod-resources", s.ContainPodResources, "Reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
fs.BoolVar(&s.AccountForPodResources, "account-for-pod-resources", s.AccountForPodResources, "Allocate pod CPU and memory resources from offers (Default: true)")
fs.StringVar(&s.mesosMaster, "mesos-master", s.mesosMaster, "Location of the Mesos master. The format is a comma-delimited list of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
fs.StringVar(&s.mesosUser, "mesos-user", s.mesosUser, "Mesos user for this framework, defaults to root.")
fs.StringVar(&s.mesosRole, "mesos-role", s.mesosRole, "Mesos role for this framework, defaults to none.")
fs.StringVar(&s.mesosAuthPrincipal, "mesos-authentication-principal", s.mesosAuthPrincipal, "Mesos authentication principal.")
fs.StringVar(&s.mesosAuthSecretFile, "mesos-authentication-secret-file", s.mesosAuthSecretFile, "Mesos authentication secret file.")
fs.StringVar(&s.mesosAuthProvider, "mesos-authentication-provider", s.mesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
fs.StringVar(&s.dockerCfgPath, "dockercfg-path", s.dockerCfgPath, "Path to a dockercfg file that will be used by the docker instance of the minions.")
fs.StringVar(&s.mesosCgroupPrefix, "mesos-cgroup-prefix", s.mesosCgroupPrefix, "The cgroup prefix concatenated with MESOS_DIRECTORY must give the executor cgroup set by Mesos")
fs.Var(&s.mesosExecutorCPUs, "mesos-executor-cpus", "Initial CPU shares to allocate for each Mesos executor container.")
fs.Var(&s.mesosExecutorMem, "mesos-executor-mem", "Initial memory (MB) to allocate for each Mesos executor container.")
fs.BoolVar(&s.checkpoint, "checkpoint", s.checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
fs.Float64Var(&s.failoverTimeout, "failover-timeout", s.failoverTimeout, fmt.Sprintf("Framework failover timeout, in sec."))
fs.UintVar(&s.driverPort, "driver-port", s.driverPort, "Port that the Mesos scheduler driver process should listen on.")
fs.StringVar(&s.hostnameOverride, "hostname-override", s.hostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
fs.Int64Var(&s.reconcileInterval, "reconcile-interval", s.reconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
fs.DurationVar(&s.reconcileCooldown, "reconcile-cooldown", s.reconcileCooldown, "Minimum rest period between task reconciliation operations.")
fs.StringVar(&s.schedulerConfigFileName, "scheduler-config", s.schedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
fs.BoolVar(&s.graceful, "graceful", s.graceful, "Indicator of a graceful failover, intended for internal use only.")
fs.BoolVar(&s.ha, "ha", s.ha, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
fs.StringVar(&s.frameworkName, "framework-name", s.frameworkName, "The framework name to register with Mesos.")
fs.StringVar(&s.frameworkWebURI, "framework-weburi", s.frameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
fs.StringVar(&s.advertisedAddress, "advertised-address", s.advertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
fs.IPVar(&s.serviceAddress, "service-address", s.serviceAddress, "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
fs.Var(&s.defaultContainerCPULimit, "default-container-cpu-limit", "Containers without a CPU resource limit are admitted with this many CPU shares")
fs.Var(&s.defaultContainerMemLimit, "default-container-mem-limit", "Containers without a memory resource limit are admitted with this much memory (in MB)")
fs.BoolVar(&s.containPodResources, "contain-pod-resources", s.containPodResources, "Reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
fs.BoolVar(&s.accountForPodResources, "account-for-pod-resources", s.accountForPodResources, "Allocate pod CPU and memory resources from offers (Default: true)")
fs.DurationVar(&s.nodeRelistPeriod, "node-monitor-period", s.nodeRelistPeriod, "Period between relisting of all nodes from the apiserver.")
fs.IntVar(&s.ExecutorLogV, "executor-logv", s.ExecutorLogV, "Logging verbosity of spawned minion and executor processes.")
fs.BoolVar(&s.ExecutorBindall, "executor-bindall", s.ExecutorBindall, "When true will set -address of the executor to 0.0.0.0.")
fs.DurationVar(&s.ExecutorSuicideTimeout, "executor-suicide-timeout", s.ExecutorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")
fs.DurationVar(&s.LaunchGracePeriod, "mesos-launch-grace-period", s.LaunchGracePeriod, "Launch grace period after which launching tasks will be cancelled. Zero disables launch cancellation.")
fs.StringVar(&s.SandboxOverlay, "mesos-sandbox-overlay", s.SandboxOverlay, "Path to an archive (tar.gz, tar.bz2 or zip) extracted into the sandbox.")
fs.IntVar(&s.executorLogV, "executor-logv", s.executorLogV, "Logging verbosity of spawned minion and executor processes.")
fs.BoolVar(&s.executorBindall, "executor-bindall", s.executorBindall, "When true will set -address of the executor to 0.0.0.0.")
fs.DurationVar(&s.executorSuicideTimeout, "executor-suicide-timeout", s.executorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")
fs.DurationVar(&s.launchGracePeriod, "mesos-launch-grace-period", s.launchGracePeriod, "Launch grace period after which launching tasks will be cancelled. Zero disables launch cancellation.")
fs.StringVar(&s.sandboxOverlay, "mesos-sandbox-overlay", s.sandboxOverlay, "Path to an archive (tar.gz, tar.bz2 or zip) extracted into the sandbox.")
fs.BoolVar(&s.ProxyBindall, "proxy-bindall", s.ProxyBindall, "When true pass -proxy-bindall to the executor.")
fs.BoolVar(&s.RunProxy, "run-proxy", s.RunProxy, "Run the kube-proxy as a side process of the executor.")
fs.IntVar(&s.ProxyLogV, "proxy-logv", s.ProxyLogV, "Logging verbosity of spawned minion proxy processes.")
fs.BoolVar(&s.proxyBindall, "proxy-bindall", s.proxyBindall, "When true pass -proxy-bindall to the executor.")
fs.BoolVar(&s.runProxy, "run-proxy", s.runProxy, "Run the kube-proxy as a side process of the executor.")
fs.IntVar(&s.proxyLogV, "proxy-logv", s.proxyLogV, "Logging verbosity of spawned minion proxy processes.")
fs.StringVar(&s.MinionPathOverride, "minion-path-override", s.MinionPathOverride, "Override the PATH in the environment of the minion sub-processes.")
fs.Var(resource.NewQuantityFlagValue(&s.MinionLogMaxSize), "minion-max-log-size", "Maximum log file size for the executor and proxy before rotation")
fs.IntVar(&s.MinionLogMaxAgeInDays, "minion-max-log-age", s.MinionLogMaxAgeInDays, "Maximum log file age of the executor and proxy in days")
fs.IntVar(&s.MinionLogMaxBackups, "minion-max-log-backups", s.MinionLogMaxBackups, "Maximum log file backups of the executor and proxy to keep after rotation")
fs.StringVar(&s.minionPathOverride, "minion-path-override", s.minionPathOverride, "Override the PATH in the environment of the minion sub-processes.")
fs.Var(resource.NewQuantityFlagValue(&s.minionLogMaxSize), "minion-max-log-size", "Maximum log file size for the executor and proxy before rotation")
fs.IntVar(&s.minionLogMaxAgeInDays, "minion-max-log-age", s.minionLogMaxAgeInDays, "Maximum log file age of the executor and proxy in days")
fs.IntVar(&s.minionLogMaxBackups, "minion-max-log-backups", s.minionLogMaxBackups, "Maximum log file backups of the executor and proxy to keep after rotation")
fs.StringVar(&s.KubeletRootDirectory, "kubelet-root-dir", s.KubeletRootDirectory, "Directory path for managing kubelet files (volume mounts, etc.). Defaults to executor sandbox.")
fs.StringVar(&s.KubeletDockerEndpoint, "kubelet-docker-endpoint", s.KubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
fs.StringVar(&s.KubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.KubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
fs.UintVar(&s.KubeletCadvisorPort, "kubelet-cadvisor-port", s.KubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
fs.StringVar(&s.KubeletHostNetworkSources, "kubelet-host-network-sources", s.KubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host network. For all sources use \"*\" [default=\"file\"]")
fs.DurationVar(&s.KubeletSyncFrequency, "kubelet-sync-frequency", s.KubeletSyncFrequency, "Max period between synchronizing running containers and config")
fs.StringVar(&s.KubeletNetworkPluginName, "kubelet-network-plugin", s.KubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
fs.StringVar(&s.kubeletRootDirectory, "kubelet-root-dir", s.kubeletRootDirectory, "Directory path for managing kubelet files (volume mounts, etc.). Defaults to executor sandbox.")
fs.StringVar(&s.kubeletDockerEndpoint, "kubelet-docker-endpoint", s.kubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
fs.StringVar(&s.kubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.kubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
fs.UintVar(&s.kubeletCadvisorPort, "kubelet-cadvisor-port", s.kubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
fs.StringVar(&s.kubeletHostNetworkSources, "kubelet-host-network-sources", s.kubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host network. For all sources use \"*\" [default=\"file\"]")
fs.DurationVar(&s.kubeletSyncFrequency, "kubelet-sync-frequency", s.kubeletSyncFrequency, "Max period between synchronizing running containers and config")
fs.StringVar(&s.kubeletNetworkPluginName, "kubelet-network-plugin", s.kubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
//TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
//fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
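// Sketch, not part of this commit: a FlagSet populated by addCoreFlags is
// wired the standard pflag way; each registration passes the field twice
// because its current value doubles as the flag's default.
//
//	fs := pflag.NewFlagSet("scheduler", pflag.ExitOnError)
//	s.addCoreFlags(fs)
//	if err := fs.Parse(os.Args[1:]); err != nil {
//		log.Fatalf("failed to parse scheduler flags: %v", err)
//	}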
@@ -285,12 +288,12 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
func (s *SchedulerServer) AddStandaloneFlags(fs *pflag.FlagSet) {
s.addCoreFlags(fs)
fs.StringVar(&s.ExecutorPath, "executor-path", s.ExecutorPath, "Location of the kubernetes executor executable")
fs.StringVar(&s.executorPath, "executor-path", s.executorPath, "Location of the kubernetes executor executable")
}
func (s *SchedulerServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
s.addCoreFlags(fs)
fs.StringVar(&s.KMPath, "km-path", s.KMPath, "Location of the km executable, may be a URI or an absolute file path.")
fs.StringVar(&s.kmPath, "km-path", s.kmPath, "Location of the km executable, may be a URI or an absolute file path.")
}
// returns (downloadURI, basename(path))
@@ -310,12 +313,12 @@ func (s *SchedulerServer) serveFrameworkArtifactWithFilename(path string, filena
serveFile("/"+filename, path)
hostURI := ""
if s.AdvertisedAddress != "" {
hostURI = fmt.Sprintf("http://%s/%s", s.AdvertisedAddress, filename)
} else if s.HA && s.HADomain != "" {
hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.HADomain, ports.SchedulerPort, filename)
if s.advertisedAddress != "" {
hostURI = fmt.Sprintf("http://%s/%s", s.advertisedAddress, filename)
} else if s.ha && s.haDomain != "" {
hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.haDomain, ports.SchedulerPort, filename)
} else {
hostURI = fmt.Sprintf("http://%s:%d/%s", s.Address.String(), s.Port, filename)
hostURI = fmt.Sprintf("http://%s:%d/%s", s.address.String(), s.port, filename)
}
log.V(2).Infof("Hosting artifact '%s' at '%s'", filename, hostURI)
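// The hostURI precedence above, restated as a sketch (field names as in this
// file): an explicit --advertised-address wins, then the HA service domain,
// then the scheduler's own bind address and port.
//
//	switch {
//	case s.advertisedAddress != "":
//		base = s.advertisedAddress
//	case s.ha && s.haDomain != "":
//		base = fmt.Sprintf("%s.%s:%d", SCHEDULER_SERVICE_NAME, s.haDomain, ports.SchedulerPort)
//	default:
//		base = fmt.Sprintf("%s:%d", s.address.String(), s.port)
//	}
//	hostURI = fmt.Sprintf("http://%s/%s", base, filename)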
@@ -327,21 +330,21 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
Shell: proto.Bool(false),
}
if s.ExecutorPath != "" {
uri, executorCmd := s.serveFrameworkArtifact(s.ExecutorPath)
if s.executorPath != "" {
uri, executorCmd := s.serveFrameworkArtifact(s.executorPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
} else if !hks.FindServer(hyperkube.CommandMinion) {
return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
} else {
if strings.Index(s.KMPath, "://") > 0 {
if strings.Index(s.kmPath, "://") > 0 {
// URI could point directly to executable, e.g. hdfs:///km
// or else indirectly, e.g. http://acmestorage/tarball.tgz
// so we assume that for this case the command will always "km"
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.KMPath), Executable: proto.Bool(true)})
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.kmPath), Executable: proto.Bool(true)})
ci.Value = proto.String("./km") // TODO(jdef) extract constant
} else if s.KMPath != "" {
uri, kmCmd := s.serveFrameworkArtifact(s.KMPath)
} else if s.kmPath != "" {
uri, kmCmd := s.serveFrameworkArtifact(s.kmPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
} else {
@@ -351,55 +354,55 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
}
ci.Arguments = append(ci.Arguments, hyperkube.CommandMinion)
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.RunProxy))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.ProxyBindall))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-logv=%d", s.ProxyLogV))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.runProxy))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.proxyBindall))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-logv=%d", s.proxyLogV))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--path-override=%s", s.MinionPathOverride))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-size=%v", s.MinionLogMaxSize.String()))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-backups=%d", s.MinionLogMaxBackups))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-age=%d", s.MinionLogMaxAgeInDays))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--path-override=%s", s.minionPathOverride))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-size=%v", s.minionLogMaxSize.String()))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-backups=%d", s.minionLogMaxBackups))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-age=%d", s.minionLogMaxAgeInDays))
}
if s.SandboxOverlay != "" {
if _, err := os.Stat(s.SandboxOverlay); os.IsNotExist(err) {
log.Fatalf("Sandbox overlay archive not found: %s", s.SandboxOverlay)
if s.sandboxOverlay != "" {
if _, err := os.Stat(s.sandboxOverlay); os.IsNotExist(err) {
return nil, nil, fmt.Errorf("Sandbox overlay archive not found: %s", s.sandboxOverlay)
}
uri, _ := s.serveFrameworkArtifact(s.SandboxOverlay)
uri, _ := s.serveFrameworkArtifact(s.sandboxOverlay)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(false), Extract: proto.Bool(true)})
}
if s.DockerCfgPath != "" {
uri := s.serveFrameworkArtifactWithFilename(s.DockerCfgPath, ".dockercfg")
if s.dockerCfgPath != "" {
uri := s.serveFrameworkArtifactWithFilename(s.dockerCfgPath, ".dockercfg")
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(false), Extract: proto.Bool(false)})
}
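// Fetcher semantics for the URIs above, per the mesos.CommandInfo_URI message
// used throughout this function: Executable marks the fetched file +x, while
// Extract asks the Mesos fetcher to unpack recognized archives into the
// sandbox. The overlay is therefore unpacked in place and .dockercfg arrives
// as a plain file:
//
//	&mesos.CommandInfo_URI{
//		Value:      proto.String(uri),
//		Executable: proto.Bool(false),
//		Extract:    proto.Bool(true), // false for .dockercfg
//	}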
//TODO(jdef): provide some way (env var?) for users to customize executor config
//TODO(jdef): set -address to 127.0.0.1 if `address` is 127.0.0.1
apiServerArgs := strings.Join(s.APIServerList, ",")
apiServerArgs := strings.Join(s.apiServerList, ",")
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--api-servers=%s", apiServerArgs))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.ExecutorLogV)) // this also applies to the minion
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.AllowPrivileged))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.ExecutorSuicideTimeout))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-launch-grace-period=%v", s.LaunchGracePeriod))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.executorLogV)) // this also applies to the minion
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.allowPrivileged))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.executorSuicideTimeout))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-launch-grace-period=%v", s.launchGracePeriod))
if s.ExecutorBindall {
if s.executorBindall {
//TODO(jdef) determine whether hostname-override is really needed for bindall because
//it conflicts with kubelet node status checks/updates
//ci.Arguments = append(ci.Arguments, "--hostname-override=0.0.0.0")
ci.Arguments = append(ci.Arguments, "--address=0.0.0.0")
}
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-cgroup-prefix=%v", s.MesosCgroupPrefix))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.KubeletCadvisorPort))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.KubeletSyncFrequency))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.ContainPodResources))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.EnableProfiling))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-cgroup-prefix=%v", s.mesosCgroupPrefix))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.kubeletCadvisorPort))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.kubeletSyncFrequency))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.containPodResources))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.enableProfiling))
if s.AuthPath != "" {
if s.authPath != "" {
//TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file
uri, basename := s.serveFrameworkArtifact(s.AuthPath)
uri, basename := s.serveFrameworkArtifact(s.authPath)
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri)})
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--auth-path=%s", basename))
}
@@ -408,15 +411,15 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value))
}
}
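// The appendOptional closure whose tail appears above presumably guards on a
// non-empty value; a minimal reconstruction consistent with its call sites:
//
//	appendOptional := func(name, value string) {
//		if value != "" {
//			ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value))
//		}
//	}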
if s.ClusterDNS != nil {
appendOptional("cluster-dns", s.ClusterDNS.String())
if s.clusterDNS != nil {
appendOptional("cluster-dns", s.clusterDNS.String())
}
appendOptional("cluster-domain", s.ClusterDomain)
appendOptional("root-dir", s.KubeletRootDirectory)
appendOptional("docker-endpoint", s.KubeletDockerEndpoint)
appendOptional("pod-infra-container-image", s.KubeletPodInfraContainerImage)
appendOptional("host-network-sources", s.KubeletHostNetworkSources)
appendOptional("network-plugin", s.KubeletNetworkPluginName)
appendOptional("cluster-domain", s.clusterDomain)
appendOptional("root-dir", s.kubeletRootDirectory)
appendOptional("docker-endpoint", s.kubeletDockerEndpoint)
appendOptional("pod-infra-container-image", s.kubeletPodInfraContainerImage)
appendOptional("host-network-sources", s.kubeletHostNetworkSources)
appendOptional("network-plugin", s.kubeletNetworkPluginName)
log.V(1).Infof("prepared executor command %q with args '%+v'", ci.GetValue(), ci.Arguments)
@@ -429,8 +432,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
// Check for staticPods
var staticPodCPUs, staticPodMem float64
if s.StaticPodsConfigPath != "" {
bs, paths, err := archive.ZipDir(s.StaticPodsConfigPath)
if s.staticPodsConfigPath != "" {
bs, paths, err := archive.ZipDir(s.staticPodsConfigPath)
if err != nil {
return nil, nil, err
}
@@ -451,8 +454,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
}
// TODO(sttts): allow unlimited static pods as well and patch in the default resource limits
unlimitedCPU := mresource.LimitPodCPU(&pod, s.DefaultContainerCPULimit)
unlimitedMem := mresource.LimitPodMem(&pod, s.DefaultContainerMemLimit)
unlimitedCPU := mresource.LimitPodCPU(&pod, s.defaultContainerCPULimit)
unlimitedMem := mresource.LimitPodMem(&pod, s.defaultContainerMemLimit)
if unlimitedCPU {
return nil, nil, fmt.Errorf("found static pod without limit on cpu resources: %v", podPath)
}
@@ -473,8 +476,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
}
execInfo.Resources = []*mesos.Resource{
mutil.NewScalarResource("cpus", float64(s.MesosExecutorCPUs)+staticPodCPUs),
mutil.NewScalarResource("mem", float64(s.MesosExecutorMem)+staticPodMem),
mutil.NewScalarResource("cpus", float64(s.mesosExecutorCPUs)+staticPodCPUs),
mutil.NewScalarResource("mem", float64(s.mesosExecutorMem)+staticPodMem),
}
// calculate ExecutorInfo hash to be used for validating compatibility
@@ -489,7 +492,7 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
// TODO(jdef): hacked from kubelet/server/server.go
// TODO(k8s): replace this with clientcmd
func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) {
authInfo, err := clientauth.LoadFromFile(s.AuthPath)
authInfo, err := clientauth.LoadFromFile(s.authPath)
if err != nil {
log.Warningf("Could not load kubernetes auth path: %v. Continuing with defaults.", err)
}
@@ -501,14 +504,14 @@ func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) {
if err != nil {
return nil, err
}
if len(s.APIServerList) < 1 {
if len(s.apiServerList) < 1 {
return nil, fmt.Errorf("no api servers specified")
}
// TODO: adapt Kube client to support LB over several servers
if len(s.APIServerList) > 1 {
if len(s.apiServerList) > 1 {
log.Infof("Multiple api servers specified. Picking first one")
}
clientConfig.Host = s.APIServerList[0]
clientConfig.Host = s.apiServerList[0]
c, err := client.New(&clientConfig)
if err != nil {
return nil, err
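// Until the load-balancing TODO above lands, only the first --api-servers
// entry is ever contacted; a usage sketch with hypothetical addresses:
//
//	s.apiServerList = []string{"http://10.0.0.1:8080", "http://10.0.0.2:8080"}
//	c, err := s.createAPIServerClient() // talks to 10.0.0.1 only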
@@ -531,8 +534,8 @@ func (s *SchedulerServer) getDriver() (driver bindings.SchedulerDriver) {
func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
// get scheduler low-level config
sc := schedcfg.CreateDefaultConfig()
if s.SchedulerConfigFileName != "" {
f, err := os.Open(s.SchedulerConfigFileName)
if s.schedulerConfigFileName != "" {
f, err := os.Open(s.schedulerConfigFileName)
if err != nil {
log.Fatalf("Cannot open scheduler config file: %v", err)
}
@@ -545,18 +548,18 @@ func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
schedulerProcess, driverFactory, etcdClient, eid := s.bootstrap(hks, sc)
if s.EnableProfiling {
if s.enableProfiling {
profile.InstallHandler(s.mux)
}
go runtime.Until(func() {
log.V(1).Info("Starting HTTP interface")
log.Error(http.ListenAndServe(net.JoinHostPort(s.Address.String(), strconv.Itoa(s.Port)), s.mux))
log.Error(http.ListenAndServe(net.JoinHostPort(s.address.String(), strconv.Itoa(s.port)), s.mux))
}, sc.HttpBindInterval.Duration, schedulerProcess.Terminal())
if s.HA {
if s.ha {
validation := ha.ValidationFunc(validateLeadershipTransition)
srv := ha.NewCandidate(schedulerProcess, driverFactory, validation)
path := fmt.Sprintf(meta.DefaultElectionFormat, s.FrameworkName)
path := fmt.Sprintf(meta.DefaultElectionFormat, s.frameworkName)
sid := uid.New(eid.Group(), "").String()
log.Infof("registering for election at %v with id %v", path, sid)
go election.Notify(election.NewEtcdMasterElector(etcdClient), path, sid, srv, nil)
@@ -595,7 +598,7 @@ func (s *SchedulerServer) awaitFailover(schedulerProcess schedulerProcessInterfa
case <-schedulerProcess.Failover():
err = doFailover()
default:
if s.HA {
if s.ha {
err = fmt.Errorf("ha scheduler exiting instead of failing over")
} else {
log.Infof("exiting scheduler")
@@ -637,22 +640,22 @@ func newEtcd(etcdConfigFile string, etcdServerList []string) (client tools.EtcdC
func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config) (*ha.SchedulerProcess, ha.DriverFactory, tools.EtcdClient, *uid.UID) {
s.FrameworkName = strings.TrimSpace(s.FrameworkName)
if s.FrameworkName == "" {
s.frameworkName = strings.TrimSpace(s.frameworkName)
if s.frameworkName == "" {
log.Fatalf("framework-name must be a non-empty string")
}
s.FrameworkWebURI = strings.TrimSpace(s.FrameworkWebURI)
s.frameworkWebURI = strings.TrimSpace(s.frameworkWebURI)
metrics.Register()
runtime.Register()
s.mux.Handle("/metrics", prometheus.Handler())
healthz.InstallHandler(s.mux)
if (s.EtcdConfigFile != "" && len(s.EtcdServerList) != 0) || (s.EtcdConfigFile == "" && len(s.EtcdServerList) == 0) {
if (s.etcdConfigFile != "" && len(s.etcdServerList) != 0) || (s.etcdConfigFile == "" && len(s.etcdServerList) == 0) {
log.Fatalf("specify either --etcd-servers or --etcd-config")
}
if len(s.APIServerList) < 1 {
if len(s.apiServerList) < 1 {
log.Fatal("No api servers specified.")
}
@@ -662,9 +665,9 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
}
s.client = client
if s.ReconcileCooldown < defaultReconcileCooldown {
s.ReconcileCooldown = defaultReconcileCooldown
log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.ReconcileCooldown)
if s.reconcileCooldown < defaultReconcileCooldown {
s.reconcileCooldown = defaultReconcileCooldown
log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.reconcileCooldown)
}
executor, eid, err := s.prepareExecutorInfo(hks)
@@ -676,25 +679,25 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
// (1) the generic config store is available for the FrameworkId storage
// (2) the generic master election is provided by the apiserver
// Compare docs/proposals/high-availability.md
etcdClient, err := newEtcd(s.EtcdConfigFile, s.EtcdServerList)
etcdClient, err := newEtcd(s.etcdConfigFile, s.etcdServerList)
if err != nil {
log.Fatalf("misconfigured etcd: %v", err)
}
as := scheduler.NewAllocationStrategy(
as := podschedulers.NewAllocationStrategy(
podtask.NewDefaultPredicate(
s.DefaultContainerCPULimit,
s.DefaultContainerMemLimit,
s.defaultContainerCPULimit,
s.defaultContainerMemLimit,
),
podtask.NewDefaultProcurement(
s.DefaultContainerCPULimit,
s.DefaultContainerMemLimit,
s.defaultContainerCPULimit,
s.defaultContainerMemLimit,
),
)
// downgrade allocation strategy if user disables "account-for-pod-resources"
if !s.AccountForPodResources {
as = scheduler.NewAllocationStrategy(
if !s.accountForPodResources {
as = podschedulers.NewAllocationStrategy(
podtask.DefaultMinimalPredicate,
podtask.DefaultMinimalProcurement)
}
@@ -716,48 +719,61 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
return n.(*api.Node)
}
fcfs := scheduler.NewFCFSPodScheduler(as, lookupNode)
mesosPodScheduler := scheduler.New(scheduler.Config{
Schedcfg: *sc,
fcfs := podschedulers.NewFCFSPodScheduler(as, lookupNode)
framework := framework.New(framework.Config{
SchedulerConfig: *sc,
Executor: executor,
Scheduler: fcfs,
Client: client,
EtcdClient: etcdClient,
FailoverTimeout: s.FailoverTimeout,
ReconcileInterval: s.ReconcileInterval,
ReconcileCooldown: s.ReconcileCooldown,
FailoverTimeout: s.failoverTimeout,
ReconcileInterval: s.reconcileInterval,
ReconcileCooldown: s.reconcileCooldown,
LookupNode: lookupNode,
StoreFrameworkId: func(id string) {
// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
_, err := etcdClient.Set(meta.FrameworkIDKey, id, uint64(s.failoverTimeout))
if err != nil {
log.Errorf("failed to renew frameworkId TTL: %v", err)
}
},
})
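// The StoreFrameworkId hook above writes the framework ID with a TTL equal to
// the failover timeout, so the registration of a scheduler that never returns
// eventually expires from etcd. A speculative sketch of periodic renewal,
// reusing the runtime.Until helper seen earlier in Run (storeFrameworkId and
// frameworkId are hypothetical stand-ins):
//
//	go runtime.Until(func() {
//		storeFrameworkId(frameworkId) // each Set refreshes the etcd TTL
//	}, time.Duration(s.failoverTimeout/2)*time.Second, schedulerProcess.Terminal())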
masterUri := s.MesosMaster
masterUri := s.mesosMaster
info, cred, err := s.buildFrameworkInfo()
if err != nil {
log.Fatalf("Misconfigured mesos framework: %v", err)
}
schedulerProcess := ha.New(mesosPodScheduler)
schedulerProcess := ha.New(framework)
dconfig := &bindings.DriverConfig{
Scheduler: schedulerProcess,
Framework: info,
Master: masterUri,
Credential: cred,
BindingAddress: s.Address,
BindingPort: uint16(s.DriverPort),
HostnameOverride: s.HostnameOverride,
BindingAddress: s.address,
BindingPort: uint16(s.driverPort),
HostnameOverride: s.hostnameOverride,
WithAuthContext: func(ctx context.Context) context.Context {
ctx = auth.WithLoginProvider(ctx, s.MesosAuthProvider)
ctx = sasl.WithBindingAddress(ctx, s.Address)
ctx = auth.WithLoginProvider(ctx, s.mesosAuthProvider)
ctx = sasl.WithBindingAddress(ctx, s.address)
return ctx
},
}
kpl := scheduler.NewPlugin(mesosPodScheduler.NewDefaultPluginConfig(schedulerProcess.Terminal(), s.mux))
runtime.On(mesosPodScheduler.Registration(), func() { kpl.Run(schedulerProcess.Terminal()) })
runtime.On(mesosPodScheduler.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))
// create event recorder sending events to the "" namespace of the apiserver
broadcaster := record.NewBroadcaster()
recorder := broadcaster.NewRecorder(api.EventSource{Component: "scheduler"})
broadcaster.StartRecordingToSink(client.Events(""))
// create scheduler core with all components arranged around it
lw := cache.NewListWatchFromClient(client, "pods", api.NamespaceAll, fields.Everything())
sched := components.New(sc, framework, fcfs, client, recorder, schedulerProcess.Terminal(), s.mux, lw)
runtime.On(framework.Registration(), func() { sched.Run(schedulerProcess.Terminal()) })
runtime.On(framework.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))
driverFactory := ha.DriverFactory(func() (drv bindings.SchedulerDriver, err error) {
log.V(1).Infoln("performing deferred initialization")
if err = mesosPodScheduler.Init(schedulerProcess.Master(), kpl, s.mux); err != nil {
if err = framework.Init(sched, schedulerProcess.Master(), s.mux); err != nil {
return nil, fmt.Errorf("failed to initialize pod scheduler: %v", err)
}
log.V(1).Infoln("deferred init complete")
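// ha.DriverFactory defers construction of the Mesos driver until one is
// actually needed (on winning the election in HA mode), so only the active
// scheduler performs the Init above and binds to the master. Invocation
// sketch:
//
//	drv, err := driverFactory()
//	if err != nil {
//		log.Fatalf("failed to create scheduler driver: %v", err)
//	}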
@@ -806,14 +822,14 @@ func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkub
args = append(args, fmt.Sprintf("--%s=%s", flag.Name, flag.Value.String()))
}
})
if !s.Graceful {
if !s.graceful {
args = append(args, "--graceful")
}
if len(s.APIServerList) > 0 {
args = append(args, "--api-servers="+strings.Join(s.APIServerList, ","))
if len(s.apiServerList) > 0 {
args = append(args, "--api-servers="+strings.Join(s.apiServerList, ","))
}
if len(s.EtcdServerList) > 0 {
args = append(args, "--etcd-servers="+strings.Join(s.EtcdServerList, ","))
if len(s.etcdServerList) > 0 {
args = append(args, "--etcd-servers="+strings.Join(s.etcdServerList, ","))
}
args = append(args, flags.Args()...)
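// Note on the argument rebuild above: pflag's Visit walks only flags that
// were explicitly set, so the visitor copies the user's configuration
// verbatim, and --graceful is then appended at most once more, telling the
// child process that this launch is a failover. Equivalent sketch:
//
//	flags.Visit(func(f *pflag.Flag) { // set flags only, defaults are skipped
//		args = append(args, fmt.Sprintf("--%s=%s", f.Name, f.Value.String()))
//	})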
@@ -846,30 +862,30 @@ func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred
}
log.V(2).Infof("Framework configured with mesos user %v", username)
info = &mesos.FrameworkInfo{
Name: proto.String(s.FrameworkName),
Name: proto.String(s.frameworkName),
User: proto.String(username),
Checkpoint: proto.Bool(s.Checkpoint),
Checkpoint: proto.Bool(s.checkpoint),
}
if s.FrameworkWebURI != "" {
info.WebuiUrl = proto.String(s.FrameworkWebURI)
if s.frameworkWebURI != "" {
info.WebuiUrl = proto.String(s.frameworkWebURI)
}
if s.FailoverTimeout > 0 {
info.FailoverTimeout = proto.Float64(s.FailoverTimeout)
if s.failoverTimeout > 0 {
info.FailoverTimeout = proto.Float64(s.failoverTimeout)
}
if s.MesosRole != "" {
info.Role = proto.String(s.MesosRole)
if s.mesosRole != "" {
info.Role = proto.String(s.mesosRole)
}
if s.MesosAuthPrincipal != "" {
info.Principal = proto.String(s.MesosAuthPrincipal)
if s.MesosAuthSecretFile == "" {
if s.mesosAuthPrincipal != "" {
info.Principal = proto.String(s.mesosAuthPrincipal)
if s.mesosAuthSecretFile == "" {
return nil, nil, errors.New("authentication principal specified without the required credentials file")
}
secret, err := ioutil.ReadFile(s.MesosAuthSecretFile)
secret, err := ioutil.ReadFile(s.mesosAuthSecretFile)
if err != nil {
return nil, nil, err
}
cred = &mesos.Credential{
Principal: proto.String(s.MesosAuthPrincipal),
Principal: proto.String(s.mesosAuthPrincipal),
Secret: secret,
}
}
@@ -877,7 +893,7 @@ func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred
}
func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdClient) (*mesos.FrameworkID, error) {
if s.FailoverTimeout > 0 {
if s.failoverTimeout > 0 {
if response, err := client.Get(meta.FrameworkIDKey, false, false); err != nil {
if !etcdstorage.IsEtcdNotFound(err) {
return nil, fmt.Errorf("unexpected failure attempting to load framework ID from etcd: %v", err)
@@ -900,7 +916,7 @@ func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdClient) (*mesos.Fram
}
func (s *SchedulerServer) getUsername() (username string, err error) {
username = s.MesosUser
username = s.mesosUser
if username == "" {
if u, err := user.Current(); err == nil {
username = u.Username
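// Note on the fallback above: the err from user.Current is scoped to the if
// statement, so a failed lookup is deliberately swallowed and the username
// stays empty (later defaulted to root, per the --mesos-user help text). The
// same pattern in isolation:
//
//	username := s.mesosUser
//	if username == "" {
//		if u, err := user.Current(); err == nil { // lookup failure is ignored by design
//			username = u.Username
//		}
//	}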
View File

@@ -121,8 +121,8 @@ func Test_DefaultResourceLimits(t *testing.T) {
assert := assert.New(t)
s := NewSchedulerServer()
assert.Equal(s.DefaultContainerCPULimit, mresource.DefaultDefaultContainerCPULimit)
assert.Equal(s.DefaultContainerMemLimit, mresource.DefaultDefaultContainerMemLimit)
assert.Equal(s.defaultContainerCPULimit, mresource.DefaultDefaultContainerCPULimit)
assert.Equal(s.defaultContainerMemLimit, mresource.DefaultDefaultContainerMemLimit)
}
func Test_StaticPods(t *testing.T) {