Merge pull request #16316 from mesosphere/scheduler-refactor
MESOS: Refactor scheduler
contrib/mesos/docs/scheduler.monopic (new file; binary file not shown)
@@ -99,7 +99,7 @@ type NodeInfo struct {
 // KubernetesExecutor is an mesos executor that runs pods
 // in a minion machine.
-type KubernetesExecutor struct {
+type Executor struct {
 	updateChan chan<- kubetypes.PodUpdate // sent to the kubelet, closed on shutdown
 	state      stateType
 	tasks      map[string]*kuberTask
@@ -136,13 +136,13 @@ type Config struct {
 	NodeInfos chan<- NodeInfo
 }

-func (k *KubernetesExecutor) isConnected() bool {
+func (k *Executor) isConnected() bool {
 	return connectedState == (&k.state).get()
 }

 // New creates a new kubernetes executor.
-func New(config Config) *KubernetesExecutor {
-	k := &KubernetesExecutor{
+func New(config Config) *Executor {
+	k := &Executor{
 		updateChan: config.Updates,
 		state:      disconnectedState,
 		tasks:      make(map[string]*kuberTask),
@@ -187,7 +187,7 @@ func New(config Config) *KubernetesExecutor {
 	return k
 }

-func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
+func (k *Executor) Init(driver bindings.ExecutorDriver) {
 	k.killKubeletContainers()
 	k.resetSuicideWatch(driver)

@@ -196,7 +196,7 @@ func (k *KubernetesExecutor) Init(driver bindings.ExecutorDriver) {
 	//TODO(jdef) monitor kubeletFinished and shutdown if it happens
 }

-func (k *KubernetesExecutor) isDone() bool {
+func (k *Executor) isDone() bool {
 	select {
 	case <-k.terminate:
 		return true
@@ -206,7 +206,7 @@ func (k *KubernetesExecutor) isDone() bool {
 }

 // sendPodUpdate assumes that caller is holding state lock; returns true when update is sent otherwise false
-func (k *KubernetesExecutor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
+func (k *Executor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
 	if k.isDone() {
 		return false
 	}
@@ -215,7 +215,7 @@ func (k *KubernetesExecutor) sendPodUpdate(u *kubetypes.PodUpdate) bool {
 }

 // Registered is called when the executor is successfully registered with the slave.
-func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,
+func (k *Executor) Registered(driver bindings.ExecutorDriver,
 	executorInfo *mesos.ExecutorInfo, frameworkInfo *mesos.FrameworkInfo, slaveInfo *mesos.SlaveInfo) {
 	if k.isDone() {
 		return
@@ -252,7 +252,7 @@ func (k *KubernetesExecutor) Registered(driver bindings.ExecutorDriver,

 // Reregistered is called when the executor is successfully re-registered with the slave.
 // This can happen when the slave fails over.
-func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
+func (k *Executor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos.SlaveInfo) {
 	if k.isDone() {
 		return
 	}
@@ -280,7 +280,7 @@ func (k *KubernetesExecutor) Reregistered(driver bindings.ExecutorDriver, slaveI
 }

 // initializeStaticPodsSource unzips the data slice into the static-pods directory
-func (k *KubernetesExecutor) initializeStaticPodsSource(data []byte) {
+func (k *Executor) initializeStaticPodsSource(data []byte) {
 	log.V(2).Infof("extracting static pods config to %s", k.staticPodsConfigPath)
 	err := archive.UnzipDir(data, k.staticPodsConfigPath)
 	if err != nil {
@@ -290,7 +290,7 @@ func (k *KubernetesExecutor) initializeStaticPodsSource(data []byte) {
 }

 // Disconnected is called when the executor is disconnected from the slave.
-func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
+func (k *Executor) Disconnected(driver bindings.ExecutorDriver) {
 	if k.isDone() {
 		return
 	}
@@ -306,7 +306,7 @@ func (k *KubernetesExecutor) Disconnected(driver bindings.ExecutorDriver) {
 // is running, but the binding is not recorded in the Kubernetes store yet.
 // This function is invoked to tell the executor to record the binding in the
 // Kubernetes store and start the pod via the Kubelet.
-func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
+func (k *Executor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.TaskInfo) {
 	if k.isDone() {
 		return
 	}
@@ -356,7 +356,7 @@ func (k *KubernetesExecutor) LaunchTask(driver bindings.ExecutorDriver, taskInfo
 	go k.launchTask(driver, taskId, pod)
 }

-func (k *KubernetesExecutor) handleChangedApiserverPod(pod *api.Pod) {
+func (k *Executor) handleChangedApiserverPod(pod *api.Pod) {
 	// exclude "pre-scheduled" pods which have a NodeName set to this node without being scheduled already
 	taskId := pod.Annotations[meta.TaskIdKey]
 	if taskId == "" {
@@ -402,7 +402,7 @@ func (k *KubernetesExecutor) handleChangedApiserverPod(pod *api.Pod) {
 // a timer that, upon expiration, causes this executor to commit suicide.
 // this implementation runs asynchronously. callers that wish to wait for the
 // reset to complete may wait for the returned signal chan to close.
-func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
+func (k *Executor) resetSuicideWatch(driver bindings.ExecutorDriver) <-chan struct{} {
 	ch := make(chan struct{})
 	go func() {
 		defer close(ch)
@@ -432,7 +432,7 @@ func (k *KubernetesExecutor) resetSuicideWatch(driver bindings.ExecutorDriver) <
 	return ch
 }

-func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
+func (k *Executor) attemptSuicide(driver bindings.ExecutorDriver, abort <-chan struct{}) {
 	k.lock.Lock()
 	defer k.lock.Unlock()

@@ -464,7 +464,7 @@ func (k *KubernetesExecutor) attemptSuicide(driver bindings.ExecutorDriver, abor
 }

 // async continuation of LaunchTask
-func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {
+func (k *Executor) launchTask(driver bindings.ExecutorDriver, taskId string, pod *api.Pod) {
 	deleteTask := func() {
 		k.lock.Lock()
 		defer k.lock.Unlock()
@@ -475,7 +475,7 @@ func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId s
 	// TODO(k8s): use Pods interface for binding once clusters are upgraded
 	// return b.Pods(binding.Namespace).Bind(binding)
 	if pod.Spec.NodeName == "" {
-		//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/scheduler.go
+		//HACK(jdef): cloned binding construction from k8s plugin/pkg/scheduler/framework.go
 		binding := &api.Binding{
 			ObjectMeta: api.ObjectMeta{
 				Namespace: pod.Namespace,
@@ -588,7 +588,7 @@ func (k *KubernetesExecutor) launchTask(driver bindings.ExecutorDriver, taskId s
 	go k._launchTask(driver, taskId, podFullName, psf)
 }

-func (k *KubernetesExecutor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
+func (k *Executor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {

 	expired := make(chan struct{})

@@ -669,7 +669,7 @@ reportLost:
 	k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
 }

-func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
+func (k *Executor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
 	// TODO(nnielsen): Monitor health of pod and report if lost.
 	// Should we also allow this to fail a couple of times before reporting lost?
 	// What if the docker daemon is restarting and we can't connect, but it's
@@ -692,7 +692,7 @@ func (k *KubernetesExecutor) __launchTask(driver bindings.ExecutorDriver, taskId
 // whether the pod is running. It will only return false if the task is still registered and the pod is
 // registered in Docker. Otherwise it returns true. If there's still a task record on file, but no pod
 // in Docker, then we'll also send a TASK_LOST event.
-func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
+func (k *Executor) checkForLostPodTask(driver bindings.ExecutorDriver, taskId string, isKnownPod func() bool) bool {
 	// TODO (jdefelice) don't send false alarms for deleted pods (KILLED tasks)
 	k.lock.Lock()
 	defer k.lock.Unlock()
@@ -716,7 +716,7 @@ func (k *KubernetesExecutor) checkForLostPodTask(driver bindings.ExecutorDriver,
 }

 // KillTask is called when the executor receives a request to kill a task.
-func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
+func (k *Executor) KillTask(driver bindings.ExecutorDriver, taskId *mesos.TaskID) {
 	if k.isDone() {
 		return
 	}
@@ -735,14 +735,14 @@ func (k *KubernetesExecutor) KillTask(driver bindings.ExecutorDriver, taskId *me

 // Reports a lost task to the slave and updates internal task and pod tracking state.
 // Assumes that the caller is locking around pod and task state.
-func (k *KubernetesExecutor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
+func (k *Executor) reportLostTask(driver bindings.ExecutorDriver, tid, reason string) {
 	k.removePodTask(driver, tid, reason, mesos.TaskState_TASK_LOST)
 }

 // deletes the pod and task associated with the task identified by tid and sends a task
 // status update to mesos. also attempts to reset the suicide watch.
 // Assumes that the caller is locking around pod and task state.
-func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
+func (k *Executor) removePodTask(driver bindings.ExecutorDriver, tid, reason string, state mesos.TaskState) {
 	task, ok := k.tasks[tid]
 	if !ok {
 		log.V(1).Infof("Failed to remove task, unknown task %v\n", tid)
@@ -770,7 +770,7 @@ func (k *KubernetesExecutor) removePodTask(driver bindings.ExecutorDriver, tid,
 }

 // FrameworkMessage is called when the framework sends some message to the executor
-func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
+func (k *Executor) FrameworkMessage(driver bindings.ExecutorDriver, message string) {
 	if k.isDone() {
 		return
 	}
@@ -780,7 +780,7 @@ func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, me
 	}

 	log.Infof("Receives message from framework %v\n", message)
-	//TODO(jdef) master reported a lost task, reconcile this! @see scheduler.go:handleTaskLost
+	//TODO(jdef) master reported a lost task, reconcile this! @see framework.go:handleTaskLost
 	if strings.HasPrefix(message, messages.TaskLost+":") {
 		taskId := message[len(messages.TaskLost)+1:]
 		if taskId != "" {
@@ -798,14 +798,14 @@ func (k *KubernetesExecutor) FrameworkMessage(driver bindings.ExecutorDriver, me
 }

 // Shutdown is called when the executor receives a shutdown request.
-func (k *KubernetesExecutor) Shutdown(driver bindings.ExecutorDriver) {
+func (k *Executor) Shutdown(driver bindings.ExecutorDriver) {
 	k.lock.Lock()
 	defer k.lock.Unlock()
 	k.doShutdown(driver)
 }

 // assumes that caller has obtained state lock
-func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
+func (k *Executor) doShutdown(driver bindings.ExecutorDriver) {
 	defer func() {
 		log.Errorf("exiting with unclean shutdown: %v", recover())
 		if k.exitFunc != nil {
@@ -859,7 +859,7 @@ func (k *KubernetesExecutor) doShutdown(driver bindings.ExecutorDriver) {
 }

 // Destroy existing k8s containers
-func (k *KubernetesExecutor) killKubeletContainers() {
+func (k *Executor) killKubeletContainers() {
 	if containers, err := dockertools.GetKubeletDockerContainers(k.dockerClient, true); err == nil {
 		opts := docker.RemoveContainerOptions{
 			RemoveVolumes: true,
@@ -878,7 +878,7 @@ func (k *KubernetesExecutor) killKubeletContainers() {
 }

 // Error is called when some error happens.
-func (k *KubernetesExecutor) Error(driver bindings.ExecutorDriver, message string) {
+func (k *Executor) Error(driver bindings.ExecutorDriver, message string) {
 	log.Errorln(message)
 }

@@ -890,7 +890,7 @@ func newStatus(taskId *mesos.TaskID, state mesos.TaskState, message string) *mes
 	}
 }

-func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
+func (k *Executor) sendStatus(driver bindings.ExecutorDriver, status *mesos.TaskStatus) {
 	select {
 	case <-k.terminate:
 	default:
@@ -898,7 +898,7 @@ func (k *KubernetesExecutor) sendStatus(driver bindings.ExecutorDriver, status *
 	}
 }

-func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
+func (k *Executor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg string) {
 	select {
 	case <-k.terminate:
 	default:
@@ -906,7 +906,7 @@ func (k *KubernetesExecutor) sendFrameworkMessage(driver bindings.ExecutorDriver
 	}
 }

-func (k *KubernetesExecutor) sendLoop() {
+func (k *Executor) sendLoop() {
 	defer log.V(1).Info("sender loop exiting")
 	for {
 		select {
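Note: the rename above is mechanical (KubernetesExecutor becomes Executor; New and Config are unchanged). A minimal sketch of constructing the renamed type, written as if it sat inside the executor package itself so no import paths have to be assumed; it mirrors the NewTestKubernetesExecutor helper further down and the function name is made up for illustration:

    // illustrative only -- not part of this diff
    func newExecutorForIllustration() (*Executor, chan kubetypes.PodUpdate) {
        updates := make(chan kubetypes.PodUpdate, 1024)
        k := New(Config{
            Updates: updates,                                     // sent on to the kubelet
            Docker:  dockertools.ConnectToDockerOrDie("fake://"), // fake endpoint, as in the tests
        })
        return k, updates
    }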
@@ -170,11 +170,10 @@ func TestExecutorLaunchAndKillTask(t *testing.T) {
 	}

 	pod := NewTestPod(1)
-	podTask, err := podtask.New(api.NewDefaultContext(), "",
-		*pod, &mesosproto.ExecutorInfo{})
+	podTask, err := podtask.New(api.NewDefaultContext(), "", pod)
 	assert.Equal(t, nil, err, "must be able to create a task from a pod")

-	taskInfo := podTask.BuildTaskInfo()
+	taskInfo := podTask.BuildTaskInfo(&mesosproto.ExecutorInfo{})
 	data, err := testapi.Default.Codec().Encode(pod)
 	assert.Equal(t, nil, err, "must be able to encode a pod's spec data")
 	taskInfo.Data = data
@@ -417,10 +416,8 @@ func TestExecutorFrameworkMessage(t *testing.T) {

 	// set up a pod to then lose
 	pod := NewTestPod(1)
-	podTask, _ := podtask.New(api.NewDefaultContext(), "foo",
-		*pod, &mesosproto.ExecutorInfo{})
-
-	taskInfo := podTask.BuildTaskInfo()
+	podTask, _ := podtask.New(api.NewDefaultContext(), "foo", pod)
+	taskInfo := podTask.BuildTaskInfo(&mesosproto.ExecutorInfo{})
 	data, _ := testapi.Default.Codec().Encode(pod)
 	taskInfo.Data = data

@@ -66,7 +66,7 @@ func (m *MockExecutorDriver) SendFrameworkMessage(msg string) (mesosproto.Status
 	return args.Get(0).(mesosproto.Status), args.Error(1)
 }

-func NewTestKubernetesExecutor() (*KubernetesExecutor, chan kubetypes.PodUpdate) {
+func NewTestKubernetesExecutor() (*Executor, chan kubetypes.PodUpdate) {
 	updates := make(chan kubetypes.PodUpdate, 1024)
 	return New(Config{
 		Docker: dockertools.ConnectToDockerOrDie("fake://"),
@@ -219,7 +219,7 @@ func (ms *MinionServer) launchHyperkubeServer(server string, args []string, logF
 		}
 		pwd, err := os.Getwd()
 		if err != nil {
-			log.Fatalf("Cannot get current directory: %v", err)
+			panic(fmt.Errorf("Cannot get current directory: %v", err))
 		}
 		kmEnv = append(kmEnv, fmt.Sprintf("%s:%s", e, path.Join(pwd, "bin")))
 	}
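Note: the MinionServer hunk above swaps log.Fatalf for panic(fmt.Errorf(...)). The practical difference: log.Fatalf exits the process immediately via os.Exit and skips deferred cleanup, while a panic unwinds the stack, runs defers, and can be caught by a recover() in a supervising caller. A small, self-contained illustration of that behavior (not code from this PR):

    package main

    import (
        "fmt"
        "os"
    )

    func main() {
        defer func() {
            if r := recover(); r != nil {
                // cleanup or retry is possible here; os.Exit would have skipped this handler
                fmt.Println("recovered:", r)
            }
        }()
        if _, err := os.Getwd(); err != nil {
            panic(fmt.Errorf("Cannot get current directory: %v", err))
        }
        fmt.Println("working directory resolved")
    }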
contrib/mesos/pkg/scheduler/components/algorithm/algorithm.go (new file, 167 lines)
@@ -0,0 +1,167 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package algorithm

import (
	"fmt"

	log "github.com/golang/glog"
	"k8s.io/kubernetes/contrib/mesos/pkg/offers"
	"k8s.io/kubernetes/contrib/mesos/pkg/queue"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/client/cache"
)

type SchedulerAlgorithm interface {
	Schedule(pod *api.Pod) (string, error)
}

// SchedulerAlgorithm implements the algorithm.ScheduleAlgorithm interface
type schedulerAlgorithm struct {
	sched        scheduler.Scheduler
	podUpdates   queue.FIFO
	podScheduler podschedulers.PodScheduler
}

func New(sched scheduler.Scheduler, podUpdates queue.FIFO, podScheduler podschedulers.PodScheduler) SchedulerAlgorithm {
	return &schedulerAlgorithm{
		sched:        sched,
		podUpdates:   podUpdates,
		podScheduler: podScheduler,
	}
}

// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selectedMachine's name and error (if there's any).
func (k *schedulerAlgorithm) Schedule(pod *api.Pod) (string, error) {
	log.Infof("Try to schedule pod %v\n", pod.Name)
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)

	// default upstream scheduler passes pod.Name as binding.PodID
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		return "", err
	}

	k.sched.Lock()
	defer k.sched.Unlock()

	switch task, state := k.sched.Tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		// There's a bit of a potential race here, a pod could have been yielded() and
		// then before we get *here* it could be deleted.
		// We use meta to index the pod in the store since that's what k8s reflector does.
		podName, err := cache.MetaNamespaceKeyFunc(pod)
		if err != nil {
			log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
			return "", errors.NoSuchPodErr
		}
		if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
			// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
			log.Infof("aborting Schedule, pod has been deleted %+v", pod)
			return "", errors.NoSuchPodErr
		}

		podTask, err := podtask.New(ctx, "", pod)
		if err != nil {
			log.Warningf("aborting Schedule, unable to create podtask object %+v: %v", pod, err)
			return "", err
		}

		podTask, err = k.sched.Tasks().Register(podTask)
		if err != nil {
			return "", err
		}

		return k.doSchedule(podTask)

	//TODO(jdef) it's possible that the pod state has diverged from what
	//we knew previously, we should probably update the task.Pod state here
	//before proceeding with scheduling
	case podtask.StatePending:
		if pod.UID != task.Pod.UID {
			// we're dealing with a brand new pod spec here, so the old one must have been
			// deleted -- and so our task store is out of sync w/ respect to reality
			//TODO(jdef) reconcile task
			return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
		} else if task.Has(podtask.Launched) {
			// task has been marked as "launched" but the pod binding creation may have failed in k8s,
			// but we're going to let someone else handle it, probably the mesos task error handler
			return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
		} else {
			return k.doSchedule(task)
		}

	default:
		return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
	}
}

// Call ScheduleFunc and subtract some resources, returning the name of the machine the task is scheduled on
func (k *schedulerAlgorithm) doSchedule(task *podtask.T) (string, error) {
	var offer offers.Perishable
	var err error

	if task.HasAcceptedOffer() {
		// verify that the offer is still on the table
		var ok bool
		offer, ok = k.sched.Offers().Get(task.GetOfferId())

		if !ok || offer.HasExpired() {
			task.Offer.Release()
			task.Reset()
			if err = k.sched.Tasks().Update(task); err != nil {
				return "", err
			}
		}
	}

	if offer == nil {
		offer, err = k.podScheduler.SchedulePod(k.sched.Offers(), task)
	}

	if err != nil {
		return "", err
	}

	details := offer.Details()
	if details == nil {
		return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
	}

	if task.Offer != nil && task.Offer != offer {
		return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
	}

	task.Offer = offer
	if err := k.podScheduler.Procurement()(task, details); err != nil {
		offer.Release()
		task.Reset()
		return "", err
	}

	if err := k.sched.Tasks().Update(task); err != nil {
		offer.Release()
		return "", err
	}

	return details.GetHostname(), nil
}
contrib/mesos/pkg/scheduler/components/algorithm/doc.go (new file, 18 lines)
@@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package algorithm implements the SchedulerAlgorithm
package algorithm
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package podschedulers defines an interface (w/ implementations) for matching
// pods against offers.
package podschedulers
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-package scheduler
+package podschedulers

 import (
 	"fmt"
@@ -23,6 +23,7 @@ import (

 	"k8s.io/kubernetes/contrib/mesos/pkg/node"
 	"k8s.io/kubernetes/contrib/mesos/pkg/offers"
+	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
 	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
 )

@@ -62,7 +63,7 @@ func NewFCFSPodScheduler(as AllocationStrategy, lookupNode node.LookupFunc) PodS
 }

 // A first-come-first-serve scheduler: acquires the first offer that can support the task
-func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
+func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, error) {
 	podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name)
 	var acceptedOffer offers.Perishable
 	err := r.Walk(func(p offers.Perishable) (bool, error) {
@@ -101,5 +102,5 @@ func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, unused SlaveIndex, t
 		return nil, err
 	}
 	log.V(2).Infof("failed to find a fit for pod: %s", podName)
-	return nil, noSuitableOffersErr
+	return nil, errors.NoSuitableOffersErr
 }
@@ -14,11 +14,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-package scheduler
+package podschedulers

 import (
-	"errors"
-
 	"k8s.io/kubernetes/contrib/mesos/pkg/offers"
 	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
 )
@@ -37,25 +35,11 @@ type PodScheduler interface {
 	// SchedulePod implements how to schedule pods among slaves.
 	// We can have different implementation for different scheduling policy.
 	//
-	// The function accepts a group of slaves (each contains offers from
-	// that slave) and a single pod, which aligns well with the k8s scheduling
-	// algorithm. It returns an offerId that is acceptable for the pod, otherwise
-	// nil. The caller is responsible for filling in task state w/ relevant offer
-	// details.
+	// The function accepts a set of offers and a single pod, which aligns well
+	// with the k8s scheduling algorithm. It returns an offerId that is acceptable
+	// for the pod, otherwise nil. The caller is responsible for filling in task
+	// state w/ relevant offer details.
 	//
 	// See the FCFSPodScheduler for example.
-	SchedulePod(r offers.Registry, slaves SlaveIndex, task *podtask.T) (offers.Perishable, error)
+	SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, error)
 }
-
-// A minimal placeholder
-type empty struct{}
-
-var (
-	noSuitableOffersErr = errors.New("No suitable offers for pod/task")
-	noSuchPodErr        = errors.New("No such pod exists")
-	noSuchTaskErr       = errors.New("No such task exists")
-)
-
-type SlaveIndex interface {
-	slaveHostNameFor(id string) string
-}
contrib/mesos/pkg/scheduler/components/binder/binder.go (new file, 157 lines)
@@ -0,0 +1,157 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package binder

import (
	"fmt"
	"strconv"

	log "github.com/golang/glog"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
	annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	"k8s.io/kubernetes/pkg/api"
)

type Binder interface {
	Bind(binding *api.Binding) error
}

type binder struct {
	sched scheduler.Scheduler
}

func New(sched scheduler.Scheduler) Binder {
	return &binder{
		sched: sched,
	}
}

// implements binding.Registry, launches the pod-associated-task in mesos
func (b *binder) Bind(binding *api.Binding) error {

	ctx := api.WithNamespace(api.NewContext(), binding.Namespace)

	// default upstream scheduler passes pod.Name as binding.Name
	podKey, err := podtask.MakePodKey(ctx, binding.Name)
	if err != nil {
		return err
	}

	b.sched.Lock()
	defer b.sched.Unlock()

	switch task, state := b.sched.Tasks().ForPod(podKey); state {
	case podtask.StatePending:
		return b.bind(ctx, binding, task)
	default:
		// in this case it's likely that the pod has been deleted between Schedule
		// and Bind calls
		log.Infof("No pending task for pod %s", podKey)
		return errors.NoSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
	}
}

func (b *binder) rollback(task *podtask.T, err error) error {
	task.Offer.Release()
	task.Reset()
	if err2 := b.sched.Tasks().Update(task); err2 != nil {
		log.Errorf("failed to update pod task: %v", err2)
	}
	return err
}

// assumes that: caller has acquired scheduler lock and that the task is still pending
//
// bind does not actually do the binding itself, but launches the pod as a Mesos task. The
// kubernetes executor on the slave will finally do the binding. This is different from the
// upstream scheduler in the sense that the upstream scheduler does the binding and the
// kubelet will notice that and launches the pod.
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
	// sanity check: ensure that the task hasAcceptedOffer(), it's possible that between
	// Schedule() and now that the offer for this task was rescinded or invalidated.
	// ((we should never see this here))
	if !task.HasAcceptedOffer() {
		return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
	}

	// By this time, there is a chance that the slave is disconnected.
	offerId := task.GetOfferId()
	if offer, ok := b.sched.Offers().Get(offerId); !ok || offer.HasExpired() {
		// already rescinded or timed out or otherwise invalidated
		return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
	}

	if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
		log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\", cpu %.2f, mem %.2f MB",
			task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name, task.Spec.CPU, task.Spec.Memory)
		if err = b.sched.LaunchTask(task); err == nil {
			b.sched.Offers().Invalidate(offerId)
			task.Set(podtask.Launched)
			if err = b.sched.Tasks().Update(task); err != nil {
				// this should only happen if the task has been removed or has changed status,
				// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
				log.Errorf("failed to update task w/ Launched status: %v", err)
			}
			return
		}
	}
	return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err))
}

//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
	pod := task.Pod

	// we make an effort here to avoid making changes to the task's copy of the pod, since
	// we want that to reflect the initial user spec, and not the modified spec that we
	// build for the executor to consume.
	oemCt := pod.Spec.Containers
	pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod

	if pod.Annotations == nil {
		pod.Annotations = make(map[string]string)
	}

	task.SaveRecoveryInfo(pod.Annotations)
	pod.Annotations[annotation.BindingHostKey] = task.Spec.AssignedSlave

	for _, entry := range task.Spec.PortMap {
		oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
		ports := append([]api.ContainerPort{}, oemPorts...)
		p := &ports[entry.PortIdx]
		p.HostPort = int(entry.OfferPort)
		op := strconv.FormatUint(entry.OfferPort, 10)
		pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
		if p.Name != "" {
			pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
		}
		pod.Spec.Containers[entry.ContainerIdx].Ports = ports
	}

	// the kubelet-executor uses this to instantiate the pod
	log.V(3).Infof("prepared pod spec: %+v", pod)

	data, err := api.Codec.Encode(&pod)
	if err != nil {
		log.V(2).Infof("Failed to marshal the pod spec: %v", err)
		return err
	}
	task.Spec.Data = data
	return nil
}
contrib/mesos/pkg/scheduler/components/binder/doc.go (new file, 19 lines)
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package binder implements the Binder which launched a task and let the
// executor do the actual binding.
package binder
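Note: a minimal sketch of driving the new Binder directly; it mirrors the api.Binding that the controller (next file) constructs, while the helper name and the already-built scheduler.Scheduler are assumptions:

    package example // illustrative only, not part of this PR

    import (
        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/binder"
        "k8s.io/kubernetes/pkg/api"
    )

    // bindTo launches the pod's pending task on the node selected by the scheduler algorithm.
    func bindTo(sched scheduler.Scheduler, pod *api.Pod, node string) error {
        b := binder.New(sched)
        return b.Bind(&api.Binding{
            ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
            Target:     api.ObjectReference{Kind: "Node", Name: node},
        })
    }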
contrib/mesos/pkg/scheduler/components/controller/controller.go (new file, 107 lines)
@@ -0,0 +1,107 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
	"time"

	log "github.com/golang/glog"
	"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/binder"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/client/record"
	client "k8s.io/kubernetes/pkg/client/unversioned"
)

const (
	recoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling

	FailedScheduling = "FailedScheduling"
	Scheduled        = "Scheduled"
)

type Controller interface {
	Run(<-chan struct{})
}

type controller struct {
	algorithm algorithm.SchedulerAlgorithm
	binder    binder.Binder
	nextPod   func() *api.Pod
	error     func(*api.Pod, error)
	recorder  record.EventRecorder
	client    *client.Client
	started   chan<- struct{} // startup latch
}

func New(client *client.Client, algorithm algorithm.SchedulerAlgorithm,
	recorder record.EventRecorder, nextPod func() *api.Pod, error func(pod *api.Pod, schedulingErr error),
	binder binder.Binder, started chan<- struct{}) Controller {
	return &controller{
		algorithm: algorithm,
		binder:    binder,
		nextPod:   nextPod,
		error:     error,
		recorder:  recorder,
		client:    client,
		started:   started,
	}
}

func (s *controller) Run(done <-chan struct{}) {
	defer close(s.started)
	go runtime.Until(s.scheduleOne, recoveryDelay, done)
}

// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go,
// with the Modeler stuff removed since we don't use it because we have mesos.
func (s *controller) scheduleOne() {
	pod := s.nextPod()

	// pods which are pre-scheduled (i.e. NodeName is set) are deleted by the kubelet
	// in upstream. Not so in Mesos because the kubelet hasn't see that pod yet. Hence,
	// the scheduler has to take care of this:
	if pod.Spec.NodeName != "" && pod.DeletionTimestamp != nil {
		log.V(3).Infof("deleting pre-scheduled, not yet running pod: %s/%s", pod.Namespace, pod.Name)
		s.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0))
		return
	}

	log.V(3).Infof("Attempting to schedule: %+v", pod)
	dest, err := s.algorithm.Schedule(pod)
	if err != nil {
		log.V(1).Infof("Failed to schedule: %+v", pod)
		s.recorder.Eventf(pod, FailedScheduling, "Error scheduling: %v", err)
		s.error(pod, err)
		return
	}
	b := &api.Binding{
		ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
		Target: api.ObjectReference{
			Kind: "Node",
			Name: dest,
		},
	}
	if err := s.binder.Bind(b); err != nil {
		log.V(1).Infof("Failed to bind pod: %+v", err)
		s.recorder.Eventf(pod, FailedScheduling, "Binding rejected: %v", err)
		s.error(pod, err)
		return
	}
	s.recorder.Eventf(pod, Scheduled, "Successfully assigned %v to %v", pod.Name, dest)
}
contrib/mesos/pkg/scheduler/components/controller/doc.go (new file, 20 lines)
@@ -0,0 +1,20 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package controller implements the scheduling controller which waits for pod
// events from the queuer (i.e. from the apiserver), passes them to the
// SchedulerAlgorithm and in case of success to the binder which does the launch.
package controller
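Note: a sketch of how the components introduced in this PR could be wired together; every input (client, event recorder, queuer-backed nextPod/error callbacks, binder) is assumed to be constructed elsewhere, and the wrapper function is made up for illustration:

    package example // illustrative only, not part of this PR

    import (
        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm"
        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/binder"
        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/controller"
        "k8s.io/kubernetes/pkg/api"
        "k8s.io/kubernetes/pkg/client/record"
        client "k8s.io/kubernetes/pkg/client/unversioned"
    )

    func runScheduler(cl *client.Client, algo algorithm.SchedulerAlgorithm, rec record.EventRecorder,
        nextPod func() *api.Pod, handleErr func(*api.Pod, error), b binder.Binder, done <-chan struct{}) {
        started := make(chan struct{}) // closed by Run once the scheduling loop has been launched
        controller.New(cl, algo, rec, nextPod, handleErr, b, started).Run(done)
        <-started
    }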
125
contrib/mesos/pkg/scheduler/components/deleter/deleter.go
Normal file
125
contrib/mesos/pkg/scheduler/components/deleter/deleter.go
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package deleter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
|
||||||
|
log "github.com/golang/glog"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
|
||||||
|
"k8s.io/kubernetes/pkg/api"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Deleter interface {
|
||||||
|
Run(updates <-chan queue.Entry, done <-chan struct{})
|
||||||
|
DeleteOne(pod *queuer.Pod) error
|
||||||
|
}
|
||||||
|
|
||||||
|
type deleter struct {
|
||||||
|
sched scheduler.Scheduler
|
||||||
|
qr queuer.Queuer
|
||||||
|
}
|
||||||
|
|
||||||
|
func New(sched scheduler.Scheduler, qr queuer.Queuer) Deleter {
|
||||||
|
return &deleter{
|
||||||
|
sched: sched,
|
||||||
|
qr: qr,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// currently monitors for "pod deleted" events, upon which handle()
|
||||||
|
// is invoked.
|
||||||
|
func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
|
||||||
|
go runtime.Until(func() {
|
||||||
|
for {
|
||||||
|
entry := <-updates
|
||||||
|
pod := entry.Value().(*queuer.Pod)
|
||||||
|
if entry.Is(queue.DELETE_EVENT) {
|
||||||
|
if err := k.DeleteOne(pod); err != nil {
|
||||||
|
log.Error(err)
|
||||||
|
}
|
||||||
|
} else if !entry.Is(queue.POP_EVENT) {
|
||||||
|
k.qr.UpdatesAvailable()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, 1*time.Second, done)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (k *deleter) DeleteOne(pod *queuer.Pod) error {
|
||||||
|
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
|
||||||
|
podKey, err := podtask.MakePodKey(ctx, pod.Name)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
log.V(2).Infof("pod deleted: %v", podKey)
|
||||||
|
|
||||||
|
// order is important here: we want to make sure we have the lock before
|
||||||
|
// removing the pod from the scheduling queue. this makes the concurrent
|
||||||
|
// execution of scheduler-error-handling and delete-handling easier to
|
||||||
|
// reason about.
|
||||||
|
k.sched.Lock()
|
||||||
|
defer k.sched.Unlock()
|
||||||
|
|
||||||
|
// prevent the scheduler from attempting to pop this; it's also possible that
|
||||||
|
// it's concurrently being scheduled (somewhere between pod scheduling and
|
||||||
|
// binding) - if so, then we'll end up removing it from taskRegistry which
|
||||||
|
// will abort Bind()ing
|
||||||
|
k.qr.Dequeue(pod.GetUID())
|
||||||
|
|
||||||
|
switch task, state := k.sched.Tasks().ForPod(podKey); state {
|
||||||
|
case podtask.StateUnknown:
|
||||||
|
log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
|
||||||
|
return errors.NoSuchPodErr
|
||||||
|
|
||||||
|
// determine if the task has already been launched to mesos, if not then
|
||||||
|
// cleanup is easier (unregister) since there's no state to sync
|
||||||
|
case podtask.StatePending:
|
||||||
|
if !task.Has(podtask.Launched) {
|
||||||
|
// we've been invoked in between Schedule() and Bind()
|
||||||
|
if task.HasAcceptedOffer() {
|
||||||
|
task.Offer.Release()
|
||||||
|
task.Reset()
|
||||||
|
task.Set(podtask.Deleted)
|
||||||
|
//TODO(jdef) probably want better handling here
|
||||||
|
if err := k.sched.Tasks().Update(task); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
k.sched.Tasks().Unregister(task)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
fallthrough
|
||||||
|
|
||||||
|
case podtask.StateRunning:
|
||||||
|
// signal to watchers that the related pod is going down
|
||||||
|
task.Set(podtask.Deleted)
|
||||||
|
if err := k.sched.Tasks().Update(task); err != nil {
|
||||||
|
log.Errorf("failed to update task w/ Deleted status: %v", err)
|
||||||
|
}
|
||||||
|
return k.sched.KillTask(task.ID)
|
||||||
|
|
||||||
|
default:
|
||||||
|
log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
|
||||||
|
return errors.NoSuchTaskErr
|
||||||
|
}
|
||||||
|
}
|
160
contrib/mesos/pkg/scheduler/components/deleter/deleter_test.go
Normal file
160
contrib/mesos/pkg/scheduler/components/deleter/deleter_test.go
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package deleter

import (
"testing"

"github.com/stretchr/testify/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
types "k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
)

func TestDeleteOne_NonexistentPod(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)

q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
assert.Equal(0, len(q.List()))
d := New(obj, qr)
pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
Namespace: api.NamespaceDefault,
}}}
err := d.DeleteOne(pod)
assert.Equal(err, errors.NoSuchPodErr)
obj.AssertExpectations(t)
}

func TestDeleteOne_PendingPod(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)

pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", pod.Pod)
if err != nil {
t.Fatalf("failed to create task: %v", err)
}

_, err = reg.Register(task)
if err != nil {
t.Fatalf("failed to register task: %v", err)
}

// preconditions
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
q.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(q.List()))
_, found := q.Get("default/foo")
assert.True(found)

// exec & post conditions
d := New(obj, qr)
err = d.DeleteOne(pod)
assert.Nil(err)
_, found = q.Get("foo0")
assert.False(found)
assert.Equal(0, len(q.List()))
obj.AssertExpectations(t)
}

func TestDeleteOne_Running(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
reg := podtask.NewInMemoryRegistry()
obj.On("Tasks").Return(reg)

pod := &queuer.Pod{Pod: &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "foo",
UID: "foo0",
Namespace: api.NamespaceDefault,
}}}
task, err := podtask.New(api.NewDefaultContext(), "bar", pod.Pod)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}

task, err = reg.Register(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}

task.Set(podtask.Launched)
err = reg.Update(task)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}

// preconditions
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
q.Add(pod, queue.ReplaceExisting)
assert.Equal(1, len(q.List()))
_, found := q.Get("default/foo")
assert.True(found)

obj.On("KillTask", task.ID).Return(nil)

// exec & post conditions
d := New(obj, qr)
err = d.DeleteOne(pod)
assert.Nil(err)
_, found = q.Get("foo0")
assert.False(found)
assert.Equal(0, len(q.List()))
obj.AssertExpectations(t)
}

func TestDeleteOne_badPodNaming(t *testing.T) {
assert := assert.New(t)
obj := &types.MockScheduler{}
pod := &queuer.Pod{Pod: &api.Pod{}}
q := queue.NewDelayFIFO()
qr := queuer.New(q, nil)
d := New(obj, qr)

err := d.DeleteOne(pod)
assert.NotNil(err)

pod.Pod.ObjectMeta.Name = "foo"
err = d.DeleteOne(pod)
assert.NotNil(err)

pod.Pod.ObjectMeta.Name = ""
pod.Pod.ObjectMeta.Namespace = "bar"
err = d.DeleteOne(pod)
assert.NotNil(err)

obj.AssertExpectations(t)
}
19
contrib/mesos/pkg/scheduler/components/deleter/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package deleter implements the deleter which listens for pod DELETE events
// from the apiserver and kills tasks for deleted pods.
package deleter
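A rough sketch of the shape such a component takes: consume pod DELETE events and kill the backing task for each one. All names and types here are illustrative stand-ins, not this package's real API.

package main

import "fmt"

// deleteEvent is a stand-in for a pod DELETE notification from the apiserver.
type deleteEvent struct{ namespace, name string }

// killer is a stand-in for the scheduler's KillTask capability.
type killer interface {
	KillTask(id string) error
}

type logKiller struct{}

func (logKiller) KillTask(id string) error {
	fmt.Println("kill task", id)
	return nil
}

// run drains DELETE events and kills the task resolved for each deleted pod.
func run(events <-chan deleteEvent, taskFor func(ns, name string) (string, bool), k killer) {
	for ev := range events {
		if id, ok := taskFor(ev.namespace, ev.name); ok {
			_ = k.KillTask(id)
		}
	}
}

func main() {
	events := make(chan deleteEvent, 1)
	events <- deleteEvent{"default", "foo"}
	close(events)
	run(events, func(ns, name string) (string, bool) { return "pod-task-1", true }, logKiller{})
}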
20
contrib/mesos/pkg/scheduler/components/doc.go
Normal file
@@ -0,0 +1,20 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package components implements independent aspects of the scheduler which
// do not use Framework or Scheduler internals, but rely solely on the Scheduler
// interface.
package components
19
contrib/mesos/pkg/scheduler/components/errorhandler/doc.go
Normal file
@@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package errorhandler implements the ErrorHandler which handles scheduler errors
// and possibly requeues pods for scheduling again.
package errorhandler
@@ -0,0 +1,97 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package errorhandler

import (
log "github.com/golang/glog"
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/util"
)

type ErrorHandler interface {
Error(pod *api.Pod, schedulingErr error)
}

type errorHandler struct {
sched scheduler.Scheduler
backoff *backoff.Backoff
qr queuer.Queuer
newBreakChan func(podKey string) queue.BreakChan
}

func New(sched scheduler.Scheduler, backoff *backoff.Backoff, qr queuer.Queuer, newBC func(podKey string) queue.BreakChan) ErrorHandler {
return &errorHandler{
sched: sched,
backoff: backoff,
qr: qr,
newBreakChan: newBC,
}
}

// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) Error(pod *api.Pod, schedulingErr error) {

if schedulingErr == errors.NoSuchPodErr {
log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
return
}

log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
defer util.HandleCrash()

// default upstream scheduler passes pod.Name as binding.PodID
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
podKey, err := podtask.MakePodKey(ctx, pod.Name)
if err != nil {
log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
return
}

k.backoff.GC()
k.sched.Lock()
defer k.sched.Unlock()

switch task, state := k.sched.Tasks().ForPod(podKey); state {
case podtask.StateUnknown:
// if we don't have a mapping here any more then someone deleted the pod
log.V(2).Infof("Could not resolve pod to task, aborting pod reschedule: %s", podKey)
return

case podtask.StatePending:
if task.Has(podtask.Launched) {
log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
return
}
breakoutEarly := queue.BreakChan(nil)
if schedulingErr == errors.NoSuitableOffersErr {
log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
breakoutEarly = k.newBreakChan(podKey)
}
delay := k.backoff.Get(podKey)
log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
k.qr.Requeue(&queuer.Pod{Pod: pod, Delay: &delay, Notify: breakoutEarly})

default:
log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
}
}
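The Requeue call above delays the pod by a per-pod backoff, but the NoSuitableOffersErr case may break out early once new offers arrive. A standard-library-only sketch of that wait-with-breakout pattern (simplified and hypothetical, not the queue package's actual implementation):

package main

import (
	"fmt"
	"time"
)

// requeueAfter waits out the backoff delay, unless the breakout channel is
// closed first (e.g. because fresh offers showed up), then requeues the key.
func requeueAfter(key string, delay time.Duration, breakout <-chan struct{}, queue chan<- string) {
	select {
	case <-time.After(delay):
		fmt.Println("requeued after full backoff:", key)
	case <-breakout:
		fmt.Println("requeued early on new offers:", key)
	}
	queue <- key
}

func main() {
	queue := make(chan string, 1)
	breakout := make(chan struct{})
	go requeueAfter("default/foo", 5*time.Second, breakout, queue)
	close(breakout) // simulate an offer arriving right away
	fmt.Println("back in the scheduling queue:", <-queue)
}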
@@ -14,5 +14,5 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

// Package slave manages node hostnames for slave ids.
// Package framework implements the Mesos scheduler.
package slave
package framework
@@ -14,83 +14,13 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler
package framework

import (
"sync"
"testing"

mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/stretchr/testify/mock"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
)

// implements SchedulerInterface
type MockScheduler struct {
sync.RWMutex
mock.Mock
}

func (m *MockScheduler) slaveHostNameFor(id string) (hostName string) {
args := m.Called(id)
x := args.Get(0)
if x != nil {
hostName = x.(string)
}
return
}

func (m *MockScheduler) algorithm() (f PodScheduler) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(PodScheduler)
}
return
}

func (m *MockScheduler) createPodTask(ctx api.Context, pod *api.Pod) (task *podtask.T, err error) {
args := m.Called(ctx, pod)
x := args.Get(0)
if x != nil {
task = x.(*podtask.T)
}
err = args.Error(1)
return
}

func (m *MockScheduler) offers() (f offers.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(offers.Registry)
}
return
}

func (m *MockScheduler) tasks() (f podtask.Registry) {
args := m.Called()
x := args.Get(0)
if x != nil {
f = x.(podtask.Registry)
}
return
}

func (m *MockScheduler) killTask(taskId string) error {
args := m.Called(taskId)
return args.Error(0)
}

func (m *MockScheduler) launchTask(task *podtask.T) error {
args := m.Called(task)
return args.Error(0)
}

// @deprecated this is a placeholder for me to test the mock package
func TestNoSlavesYet(t *testing.T) {
obj := &MockScheduler{}
obj.On("slaveHostNameFor", "foo").Return(nil)
obj.slaveHostNameFor("foo")
obj.AssertExpectations(t)
}

/*-----------------------------------------------------------------------------
|
| this really belongs in the mesos-go package, but that's being updated soon
@@ -146,57 +76,84 @@ func (m *MockSchedulerDriver) Init() error {
args := m.Called()
return args.Error(0)
}

func (m *MockSchedulerDriver) Start() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) Stop(b bool) (mesos.Status, error) {
args := m.Called(b)
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) Abort() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) Join() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) Run() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) RequestResources(r []*mesos.Request) (mesos.Status, error) {
args := m.Called(r)
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) ReconcileTasks(statuses []*mesos.TaskStatus) (mesos.Status, error) {
args := m.Called(statuses)
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) LaunchTasks(offerIds []*mesos.OfferID, ti []*mesos.TaskInfo, f *mesos.Filters) (mesos.Status, error) {
args := m.Called(offerIds, ti, f)
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) KillTask(tid *mesos.TaskID) (mesos.Status, error) {
args := m.Called(tid)
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) DeclineOffer(oid *mesos.OfferID, f *mesos.Filters) (mesos.Status, error) {
args := m.Called(oid, f)
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) ReviveOffers() (mesos.Status, error) {
args := m.Called()
return status(args, 0), args.Error(0)
}

func (m *MockSchedulerDriver) SendFrameworkMessage(eid *mesos.ExecutorID, sid *mesos.SlaveID, s string) (mesos.Status, error) {
args := m.Called(eid, sid, s)
return status(args, 0), args.Error(1)
}

func (m *MockSchedulerDriver) Destroy() {
m.Called()
}

func (m *MockSchedulerDriver) Wait() {
m.Called()
}

type JoinableDriver struct {
MockSchedulerDriver
joinFunc func() (mesos.Status, error)
}

// Join invokes joinFunc if it has been set, otherwise blocks forever
func (m *JoinableDriver) Join() (mesos.Status, error) {
if m.joinFunc != nil {
return m.joinFunc()
}
select {}
}
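These mocks are driven through testify's expectation API. A minimal, self-contained example of the pattern (a hypothetical one-method mock, not the full MockSchedulerDriver):

package example

import (
	"testing"

	"github.com/stretchr/testify/mock"
)

type mockDriver struct {
	mock.Mock
}

// Stop records the call and returns whatever the test configured via On().
func (m *mockDriver) Stop(failover bool) error {
	args := m.Called(failover)
	return args.Error(0)
}

func TestStopWithoutFailover(t *testing.T) {
	d := &mockDriver{}
	d.On("Stop", false).Return(nil) // expect exactly Stop(false)

	if err := d.Stop(false); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	d.AssertExpectations(t) // fails if the expected call never happened
}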
716
contrib/mesos/pkg/scheduler/components/framework/framework.go
Normal file
@@ -0,0 +1,716 @@
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package framework
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"math"
|
||||||
|
"net/http"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
log "github.com/golang/glog"
|
||||||
|
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||||
|
mutil "github.com/mesos/mesos-go/mesosutil"
|
||||||
|
bindings "github.com/mesos/mesos-go/scheduler"
|
||||||
|
execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/node"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
|
||||||
|
offermetrics "k8s.io/kubernetes/contrib/mesos/pkg/offers/metrics"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/tasksreconciler"
|
||||||
|
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
|
||||||
|
merrors "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
|
||||||
|
"k8s.io/kubernetes/pkg/api"
|
||||||
|
"k8s.io/kubernetes/pkg/api/errors"
|
||||||
|
client "k8s.io/kubernetes/pkg/client/unversioned"
|
||||||
|
"k8s.io/kubernetes/pkg/fields"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/container"
|
||||||
|
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
|
||||||
|
"k8s.io/kubernetes/pkg/labels"
|
||||||
|
"k8s.io/kubernetes/pkg/util/sets"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Framework interface {
|
||||||
|
bindings.Scheduler
|
||||||
|
|
||||||
|
Init(sched scheduler.Scheduler, electedMaster proc.Process, mux *http.ServeMux) error
|
||||||
|
Registration() <-chan struct{}
|
||||||
|
Offers() offers.Registry
|
||||||
|
LaunchTask(t *podtask.T) error
|
||||||
|
KillTask(id string) error
|
||||||
|
}
|
||||||
|
|
||||||
|
type framework struct {
|
||||||
|
// We use a lock here to avoid races
|
||||||
|
// between invoking the mesos callback
|
||||||
|
*sync.RWMutex
|
||||||
|
|
||||||
|
// Config related, write-once
|
||||||
|
sched scheduler.Scheduler
|
||||||
|
schedulerConfig *schedcfg.Config
|
||||||
|
executor *mesos.ExecutorInfo
|
||||||
|
executorGroup uint64
|
||||||
|
client *client.Client
|
||||||
|
failoverTimeout float64 // in seconds
|
||||||
|
reconcileInterval int64
|
||||||
|
nodeRegistrator node.Registrator
|
||||||
|
storeFrameworkId func(id string)
|
||||||
|
|
||||||
|
// Mesos context
|
||||||
|
driver bindings.SchedulerDriver // late initialization
|
||||||
|
frameworkId *mesos.FrameworkID
|
||||||
|
masterInfo *mesos.MasterInfo
|
||||||
|
registered bool
|
||||||
|
registration chan struct{} // signal chan that closes upon first successful registration
|
||||||
|
onRegistration sync.Once
|
||||||
|
offers offers.Registry
|
||||||
|
slaveHostNames *slaveRegistry
|
||||||
|
|
||||||
|
// via deferred init
|
||||||
|
tasksReconciler taskreconciler.TasksReconciler
|
||||||
|
mux *http.ServeMux
|
||||||
|
reconcileCooldown time.Duration
|
||||||
|
asRegisteredMaster proc.Doer
|
||||||
|
terminate <-chan struct{} // signal chan, closes when we should kill background tasks
|
||||||
|
}
|
||||||
|
|
||||||
|
type Config struct {
|
||||||
|
SchedulerConfig schedcfg.Config
|
||||||
|
Executor *mesos.ExecutorInfo
|
||||||
|
Client *client.Client
|
||||||
|
StoreFrameworkId func(id string)
|
||||||
|
FailoverTimeout float64
|
||||||
|
ReconcileInterval int64
|
||||||
|
ReconcileCooldown time.Duration
|
||||||
|
LookupNode node.LookupFunc
|
||||||
|
}
|
||||||
|
|
||||||
|
// New creates a new Framework
|
||||||
|
func New(config Config) Framework {
|
||||||
|
var k *framework
|
||||||
|
k = &framework{
|
||||||
|
schedulerConfig: &config.SchedulerConfig,
|
||||||
|
RWMutex: new(sync.RWMutex),
|
||||||
|
executor: config.Executor,
|
||||||
|
executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
|
||||||
|
client: config.Client,
|
||||||
|
failoverTimeout: config.FailoverTimeout,
|
||||||
|
reconcileInterval: config.ReconcileInterval,
|
||||||
|
nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode),
|
||||||
|
offers: offers.CreateRegistry(offers.RegistryConfig{
|
||||||
|
Compat: func(o *mesos.Offer) bool {
|
||||||
|
// the node must be registered and have up-to-date labels
|
||||||
|
n := config.LookupNode(o.GetHostname())
|
||||||
|
if n == nil || !node.IsUpToDate(n, node.SlaveAttributesToLabels(o.GetAttributes())) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours
|
||||||
|
for _, eid := range o.GetExecutorIds() {
|
||||||
|
execuid := uid.Parse(eid.GetValue())
|
||||||
|
if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
DeclineOffer: func(id string) <-chan error {
|
||||||
|
errOnce := proc.NewErrorOnce(k.terminate)
|
||||||
|
errOuter := k.asRegisteredMaster.Do(func() {
|
||||||
|
var err error
|
||||||
|
defer errOnce.Report(err)
|
||||||
|
offerId := mutil.NewOfferID(id)
|
||||||
|
filters := &mesos.Filters{}
|
||||||
|
_, err = k.driver.DeclineOffer(offerId, filters)
|
||||||
|
})
|
||||||
|
return errOnce.Send(errOuter).Err()
|
||||||
|
},
|
||||||
|
// remember expired offers so that we can tell if a previously scheduler offer relies on one
|
||||||
|
LingerTTL: config.SchedulerConfig.OfferLingerTTL.Duration,
|
||||||
|
TTL: config.SchedulerConfig.OfferTTL.Duration,
|
||||||
|
ListenerDelay: config.SchedulerConfig.ListenerDelay.Duration,
|
||||||
|
}),
|
||||||
|
slaveHostNames: newSlaveRegistry(),
|
||||||
|
reconcileCooldown: config.ReconcileCooldown,
|
||||||
|
registration: make(chan struct{}),
|
||||||
|
asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
|
||||||
|
return proc.ErrorChanf("cannot execute action with unregistered scheduler")
|
||||||
|
}),
|
||||||
|
storeFrameworkId: config.StoreFrameworkId,
|
||||||
|
}
|
||||||
|
return k
|
||||||
|
}
|
||||||
|
|
||||||
|
func (k *framework) Init(sched scheduler.Scheduler, electedMaster proc.Process, mux *http.ServeMux) error {
|
||||||
|
log.V(1).Infoln("initializing kubernetes mesos scheduler")
|
||||||
|
|
||||||
|
k.sched = sched
|
||||||
|
k.mux = mux
|
||||||
|
k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
|
||||||
|
if !k.registered {
|
||||||
|
return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
|
||||||
|
}
|
||||||
|
return electedMaster.Do(a)
|
||||||
|
})
|
||||||
|
k.terminate = electedMaster.Done()
|
||||||
|
k.offers.Init(k.terminate)
|
||||||
|
k.nodeRegistrator.Run(k.terminate)
|
||||||
|
return k.recoverTasks()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (k *framework) asMaster() proc.Doer {
|
||||||
|
k.RLock()
|
||||||
|
defer k.RUnlock()
|
||||||
|
return k.asRegisteredMaster
|
||||||
|
}
|
||||||
|
|
||||||
|
func (k *framework) installDebugHandlers(mux *http.ServeMux) {
|
||||||
|
wrappedHandler := func(uri string, h http.Handler) {
|
||||||
|
mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
ch := make(chan struct{})
|
||||||
|
closer := runtime.Closer(ch)
|
||||||
|
proc.OnError(k.asMaster().Do(func() {
|
||||||
|
defer closer()
|
||||||
|
h.ServeHTTP(w, r)
|
||||||
|
}), func(err error) {
|
||||||
|
defer closer()
|
||||||
|
log.Warningf("failed HTTP request for %s: %v", uri, err)
|
||||||
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
|
}, k.terminate)
|
||||||
|
select {
|
||||||
|
case <-time.After(k.schedulerConfig.HttpHandlerTimeout.Duration):
|
||||||
|
log.Warningf("timed out waiting for request to be processed")
|
||||||
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
case <-ch: // noop
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
requestReconciliation := func(uri string, requestAction func()) {
|
||||||
|
wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
requestAction()
|
||||||
|
w.WriteHeader(http.StatusNoContent)
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
requestReconciliation("/debug/actions/requestExplicit", k.tasksReconciler.RequestExplicit)
|
||||||
|
requestReconciliation("/debug/actions/requestImplicit", k.tasksReconciler.RequestImplicit)
|
||||||
|
|
||||||
|
wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
slaves := k.slaveHostNames.SlaveIDs()
|
||||||
|
for _, slaveId := range slaves {
|
||||||
|
_, err := k.driver.SendFrameworkMessage(
|
||||||
|
k.executor.ExecutorId,
|
||||||
|
mutil.NewSlaveID(slaveId),
|
||||||
|
messages.Kamikaze)
|
||||||
|
if err != nil {
|
||||||
|
log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
|
||||||
|
} else {
|
||||||
|
io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
io.WriteString(w, "OK")
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (k *framework) Registration() <-chan struct{} {
|
||||||
|
return k.registration
|
||||||
|
}
|
||||||
|
|
||||||
|
// Registered is called when the scheduler registered with the master successfully.
|
||||||
|
func (k *framework) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
|
||||||
|
log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)
|
||||||
|
|
||||||
|
k.driver = drv
|
||||||
|
k.frameworkId = fid
|
||||||
|
k.masterInfo = mi
|
||||||
|
k.registered = true
|
||||||
|
|
||||||
|
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
|
||||||
|
k.tasksReconciler.RequestExplicit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reregistered is called when the scheduler re-registered with the master successfully.
|
||||||
|
// This happends when the master fails over.
|
||||||
|
func (k *framework) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
|
||||||
|
log.Infof("Scheduler reregistered with the master: %v\n", mi)
|
||||||
|
|
||||||
|
k.driver = drv
|
||||||
|
k.masterInfo = mi
|
||||||
|
k.registered = true
|
||||||
|
|
||||||
|
k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
|
||||||
|
k.tasksReconciler.RequestExplicit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// perform one-time initialization actions upon the first registration event received from Mesos.
|
||||||
|
func (k *framework) onInitialRegistration(driver bindings.SchedulerDriver) {
|
||||||
|
defer close(k.registration)
|
||||||
|
|
||||||
|
if k.failoverTimeout > 0 {
|
||||||
|
refreshInterval := k.schedulerConfig.FrameworkIdRefreshInterval.Duration
|
||||||
|
if k.failoverTimeout < k.schedulerConfig.FrameworkIdRefreshInterval.Duration.Seconds() {
|
||||||
|
refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
|
||||||
|
}
|
||||||
|
go runtime.Until(func() {
|
||||||
|
k.storeFrameworkId(k.frameworkId.GetValue())
|
||||||
|
}, refreshInterval, k.terminate)
|
||||||
|
}
|
||||||
|
|
||||||
|
r1 := k.makeTaskRegistryReconciler()
|
||||||
|
r2 := k.makePodRegistryReconciler()
|
||||||
|
|
||||||
|
k.tasksReconciler = taskreconciler.New(k.asRegisteredMaster, taskreconciler.MakeComposite(k.terminate, r1, r2),
|
||||||
|
k.reconcileCooldown, k.schedulerConfig.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
|
||||||
|
go k.tasksReconciler.Run(driver, k.terminate)
|
||||||
|
|
||||||
|
if k.reconcileInterval > 0 {
|
||||||
|
ri := time.Duration(k.reconcileInterval) * time.Second
|
||||||
|
time.AfterFunc(k.schedulerConfig.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.tasksReconciler.RequestImplicit, ri, k.terminate) })
|
||||||
|
log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedulerConfig.InitialImplicitReconciliationDelay.Duration)
|
||||||
|
}
|
||||||
|
|
||||||
|
k.installDebugHandlers(k.mux)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Disconnected is called when the scheduler loses connection to the master.
|
||||||
|
func (k *framework) Disconnected(driver bindings.SchedulerDriver) {
|
||||||
|
log.Infof("Master disconnected!\n")
|
||||||
|
|
||||||
|
k.registered = false
|
||||||
|
|
||||||
|
// discard all cached offers to avoid unnecessary TASK_LOST updates
|
||||||
|
k.offers.Invalidate("")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResourceOffers is called when the scheduler receives some offers from the master.
|
||||||
|
func (k *framework) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
|
||||||
|
log.V(2).Infof("Received offers %+v", offers)
|
||||||
|
|
||||||
|
// Record the offers in the global offer map as well as each slave's offer map.
|
||||||
|
k.offers.Add(offers)
|
||||||
|
for _, offer := range offers {
|
||||||
|
slaveId := offer.GetSlaveId().GetValue()
|
||||||
|
k.slaveHostNames.Register(slaveId, offer.GetHostname())
|
||||||
|
|
||||||
|
// create api object if not existing already
|
||||||
|
if k.nodeRegistrator != nil {
|
||||||
|
labels := node.SlaveAttributesToLabels(offer.GetAttributes())
|
||||||
|
_, err := k.nodeRegistrator.Register(offer.GetHostname(), labels)
|
||||||
|
if err != nil {
|
||||||
|
log.Error(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// OfferRescinded is called when the resources are recinded from the scheduler.
|
||||||
|
func (k *framework) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
|
||||||
|
log.Infof("Offer rescinded %v\n", offerId)
|
||||||
|
|
||||||
|
oid := offerId.GetValue()
|
||||||
|
k.offers.Delete(oid, offermetrics.OfferRescinded)
|
||||||
|
}
|
||||||
|
|
||||||
|
// StatusUpdate is called when a status update message is sent to the scheduler.
|
||||||
|
func (k *framework) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
|
||||||
|
|
||||||
|
source, reason := "none", "none"
|
||||||
|
if taskStatus.Source != nil {
|
||||||
|
source = (*taskStatus.Source).String()
|
||||||
|
}
|
||||||
|
if taskStatus.Reason != nil {
|
||||||
|
reason = (*taskStatus.Reason).String()
|
||||||
|
}
|
||||||
|
taskState := taskStatus.GetState()
|
||||||
|
metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()
|
||||||
|
|
||||||
|
message := "none"
|
||||||
|
if taskStatus.Message != nil {
|
||||||
|
message = *taskStatus.Message
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof(
|
||||||
|
"task status update %q from %q for task %q on slave %q executor %q for reason %q with message %q",
|
||||||
|
taskState.String(),
|
||||||
|
source,
|
||||||
|
taskStatus.TaskId.GetValue(),
|
||||||
|
taskStatus.SlaveId.GetValue(),
|
||||||
|
taskStatus.ExecutorId.GetValue(),
|
||||||
|
reason,
|
||||||
|
message,
|
||||||
|
)
|
||||||
|
|
||||||
|
switch taskState {
|
||||||
|
case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
|
||||||
|
if _, state := k.sched.Tasks().UpdateStatus(taskStatus); state == podtask.StateUnknown {
|
||||||
|
if taskState != mesos.TaskState_TASK_FINISHED {
|
||||||
|
//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
|
||||||
|
//I don't want to reincarnate then.. TASK_LOST is a special case because
|
||||||
|
//the master is stateless and there are scenarios where I may get TASK_LOST
|
||||||
|
//followed by TASK_RUNNING.
|
||||||
|
//TODO(jdef) consider running this asynchronously since there are API server
|
||||||
|
//calls that may be made
|
||||||
|
k.reconcileNonTerminalTask(driver, taskStatus)
|
||||||
|
} // else, we don't really care about FINISHED tasks that aren't registered
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if hostName := k.slaveHostNames.HostName(taskStatus.GetSlaveId().GetValue()); hostName == "" {
|
||||||
|
// a registered task has an update reported by a slave that we don't recognize.
|
||||||
|
// this should never happen! So we don't reconcile it.
|
||||||
|
log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
|
||||||
|
if task, _ := k.sched.Tasks().UpdateStatus(taskStatus); task != nil {
|
||||||
|
if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
|
||||||
|
go k.sched.Reconcile(task)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// unknown task failed, not much we can do about it
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// last-ditch effort to reconcile our records
|
||||||
|
fallthrough
|
||||||
|
case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
|
||||||
|
k.reconcileTerminalTask(driver, taskStatus)
|
||||||
|
default:
|
||||||
|
log.Errorf(
|
||||||
|
"unknown task status %q from %q for task %q on slave %q executor %q for reason %q with message %q",
|
||||||
|
taskState.String(),
|
||||||
|
source,
|
||||||
|
taskStatus.TaskId.GetValue(),
|
||||||
|
taskStatus.SlaveId.GetValue(),
|
||||||
|
taskStatus.ExecutorId.GetValue(),
|
||||||
|
reason,
|
||||||
|
message,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
|
||||||
|
task, state := k.sched.Tasks().UpdateStatus(taskStatus)
|
||||||
|
|
||||||
|
if (state == podtask.StateRunning || state == podtask.StatePending) &&
|
||||||
|
((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
|
||||||
|
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
|
||||||
|
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
|
||||||
|
(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared)) {
|
||||||
|
//--
|
||||||
|
// pod-task has metadata that refers to:
|
||||||
|
// (1) a task that Mesos no longer knows about, or else
|
||||||
|
// (2) a pod that the Kubelet will never report as "failed"
|
||||||
|
// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
|
||||||
|
// For now, destroy the pod and hope that there's a replication controller backing it up.
|
||||||
|
// TODO(jdef) for case #2 don't delete the pod, just update it's status to Failed
|
||||||
|
pod := &task.Pod
|
||||||
|
log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
|
||||||
|
if err := k.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
|
||||||
|
log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
|
||||||
|
}
|
||||||
|
} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
|
||||||
|
// attempt to prevent dangling pods in the pod and task registries
|
||||||
|
log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
|
||||||
|
k.tasksReconciler.RequestExplicit()
|
||||||
|
} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
|
||||||
|
//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
|
||||||
|
//If we're reconciling and receive this then the executor may be
|
||||||
|
//running a task that we need it to kill. It's possible that the framework
|
||||||
|
//is unrecognized by the master at this point, so KillTask is not guaranteed
|
||||||
|
//to do anything. The underlying driver transport may be able to send a
|
||||||
|
//FrameworkMessage directly to the slave to terminate the task.
|
||||||
|
log.V(2).Info("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
|
||||||
|
data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
|
||||||
|
if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
|
||||||
|
log.Error(err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// reconcile an unknown (from the perspective of our registry) non-terminal task
|
||||||
|
func (k *framework) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
|
||||||
|
// attempt to recover task from pod info:
|
||||||
|
// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
|
||||||
|
// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
|
||||||
|
// - pull the pod metadata down from the api server
|
||||||
|
// - perform task recovery based on pod metadata
|
||||||
|
taskId := taskStatus.TaskId.GetValue()
|
||||||
|
if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
|
||||||
|
// there will be no data in the task status that we can use to determine the associated pod
|
||||||
|
switch taskStatus.GetState() {
|
||||||
|
case mesos.TaskState_TASK_STAGING:
|
||||||
|
// there is still hope for this task, don't kill it just yet
|
||||||
|
//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
// for TASK_{STARTING,RUNNING} we should have already attempted to recoverTasks() for.
|
||||||
|
// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
|
||||||
|
// be processing this reconciliation update before we process the one from the executor.
|
||||||
|
// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
|
||||||
|
// so it gets killed.
|
||||||
|
log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
|
||||||
|
}
|
||||||
|
} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
|
||||||
|
// possible rogue pod exists at this point because we can't identify it; should kill the task
|
||||||
|
log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
|
||||||
|
} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
|
||||||
|
// possible rogue pod exists at this point because we can't identify it; should kill the task
|
||||||
|
log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
|
||||||
|
podStatus.Name, taskId, err)
|
||||||
|
} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
|
||||||
|
if t, ok, err := podtask.RecoverFrom(*pod); ok {
|
||||||
|
log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
|
||||||
|
_, err := k.sched.Tasks().Register(t)
|
||||||
|
if err != nil {
|
||||||
|
// someone beat us to it?!
|
||||||
|
log.Warningf("failed to register recovered task: %v", err)
|
||||||
|
return
|
||||||
|
} else {
|
||||||
|
k.sched.Tasks().UpdateStatus(taskStatus)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
} else if err != nil {
|
||||||
|
//should kill the pod and the task
|
||||||
|
log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
|
||||||
|
if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
|
||||||
|
log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
|
||||||
|
//metadata is not appropriate for task reconstruction -- which should almost certainly never
|
||||||
|
//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
|
||||||
|
//we were failed over.
|
||||||
|
|
||||||
|
//kill this task, allow the newly launched scheduler to schedule the new pod
|
||||||
|
log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
|
||||||
|
}
|
||||||
|
} else if errors.IsNotFound(err) {
|
||||||
|
// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
|
||||||
|
log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
|
||||||
|
} else if errors.IsServerTimeout(err) {
|
||||||
|
log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
|
||||||
|
return
|
||||||
|
} else {
|
||||||
|
log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
|
||||||
|
log.Errorf("failed to kill task %v: %v", taskId, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FrameworkMessage is called when the scheduler receives a message from the executor.
|
||||||
|
func (k *framework) FrameworkMessage(driver bindings.SchedulerDriver,
|
||||||
|
executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) {
|
||||||
|
log.Infof("Received messages from executor %v of slave %v, %v\n", executorId, slaveId, message)
|
||||||
|
}
|
||||||
|
|
||||||
|
// SlaveLost is called when some slave is lost.
|
||||||
|
func (k *framework) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) {
|
||||||
|
log.Infof("Slave %v is lost\n", slaveId)
|
||||||
|
|
||||||
|
sid := slaveId.GetValue()
|
||||||
|
k.offers.InvalidateForSlave(sid)
|
||||||
|
|
||||||
|
// TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile
|
||||||
|
// tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically
|
||||||
|
// flush lost slaves older than X, and for which no tasks or pods reference.
|
||||||
|
|
||||||
|
// unfinished tasks/pods will be dropped. use a replication controller if you want pods to
|
||||||
|
// be restarted when slaves die.
|
||||||
|
}
|
||||||
|
|
||||||
|
// ExecutorLost is called when some executor is lost.
|
||||||
|
func (k *framework) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) {
|
||||||
|
log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status)
|
||||||
|
// TODO(yifan): Restart any unfinished tasks of the executor.
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error is called when there is an unrecoverable error in the scheduler or scheduler driver.
|
||||||
|
// The driver should have been aborted before this is invoked.
|
||||||
|
func (k *framework) Error(driver bindings.SchedulerDriver, message string) {
|
||||||
|
log.Fatalf("fatal scheduler error: %v\n", message)
|
||||||
|
}
|
||||||
|
|
||||||
|
// filter func used for explicit task reconciliation, selects only non-terminal tasks which
|
||||||
|
// have been communicated to mesos (read: launched).
|
||||||
|
func explicitTaskFilter(t *podtask.T) bool {
|
||||||
|
switch t.State {
|
||||||
|
case podtask.StateRunning:
|
||||||
|
return true
|
||||||
|
case podtask.StatePending:
|
||||||
|
return t.Has(podtask.Launched)
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// reconciler action factory, performs explicit task reconciliation for non-terminal
|
||||||
|
// tasks listed in the scheduler's internal taskRegistry.
|
||||||
|
func (k *framework) makeTaskRegistryReconciler() taskreconciler.Action {
|
||||||
|
return taskreconciler.Action(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
|
||||||
|
taskToSlave := make(map[string]string)
|
||||||
|
for _, t := range k.sched.Tasks().List(explicitTaskFilter) {
|
||||||
|
if t.Spec.SlaveID != "" {
|
||||||
|
taskToSlave[t.ID] = t.Spec.SlaveID
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// reconciler action factory, performs explicit task reconciliation for non-terminal
|
||||||
|
// tasks identified by annotations in the Kubernetes pod registry.
|
||||||
|
func (k *framework) makePodRegistryReconciler() taskreconciler.Action {
|
||||||
|
return taskreconciler.Action(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
|
||||||
|
podList, err := k.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
|
||||||
|
if err != nil {
|
||||||
|
return proc.ErrorChanf("failed to reconcile pod registry: %v", err)
|
||||||
|
}
|
||||||
|
taskToSlave := make(map[string]string)
|
||||||
|
for _, pod := range podList.Items {
|
||||||
|
if len(pod.Annotations) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
taskId, found := pod.Annotations[meta.TaskIdKey]
|
||||||
|
if !found {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
slaveId, found := pod.Annotations[meta.SlaveIdKey]
|
||||||
|
if !found {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
taskToSlave[taskId] = slaveId
|
||||||
|
}
|
||||||
|
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
|
||||||
|
func (k *framework) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
|
||||||
|
log.Info("explicit reconcile tasks")
|
||||||
|
|
||||||
|
// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
|
||||||
|
statusList := []*mesos.TaskStatus{}
|
||||||
|
remaining := sets.StringKeySet(taskToSlave)
|
||||||
|
for taskId, slaveId := range taskToSlave {
|
||||||
|
if slaveId == "" {
|
||||||
|
delete(taskToSlave, taskId)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
statusList = append(statusList, &mesos.TaskStatus{
|
||||||
|
TaskId: mutil.NewTaskID(taskId),
|
||||||
|
SlaveId: mutil.NewSlaveID(slaveId),
|
||||||
|
State: mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-cancel:
|
||||||
|
return merrors.ReconciliationCancelledErr
|
||||||
|
default:
|
||||||
|
if _, err := driver.ReconcileTasks(statusList); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
first := true
|
||||||
|
for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
|
||||||
|
first = false
|
||||||
|
// nothing to do here other than wait for status updates..
|
||||||
|
if backoff > k.schedulerConfig.ExplicitReconciliationMaxBackoff.Duration {
|
||||||
|
backoff = k.schedulerConfig.ExplicitReconciliationMaxBackoff.Duration
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-cancel:
|
||||||
|
return merrors.ReconciliationCancelledErr
|
||||||
|
case <-time.After(backoff):
|
||||||
|
for taskId := range remaining {
|
||||||
|
if task, _ := k.sched.Tasks().Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
|
||||||
|
// keep this task in remaining list
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
remaining.Delete(taskId)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ks *framework) recoverTasks() error {
|
||||||
|
podList, err := ks.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
|
||||||
|
if err != nil {
|
||||||
|
log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
recoverSlave := func(t *podtask.T) {
|
||||||
|
|
||||||
|
slaveId := t.Spec.SlaveID
|
||||||
|
ks.slaveHostNames.Register(slaveId, t.Offer.Host())
|
||||||
|
}
|
||||||
|
for _, pod := range podList.Items {
|
||||||
|
if _, isMirrorPod := pod.Annotations[kubetypes.ConfigMirrorAnnotationKey]; isMirrorPod {
|
||||||
|
// mirrored pods are never reconciled because the scheduler isn't responsible for
|
||||||
|
// scheduling them; they're started by the executor/kubelet upon instantiation and
|
||||||
|
// reflected in the apiserver afterward. the scheduler has no knowledge of them.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if t, ok, err := podtask.RecoverFrom(pod); err != nil {
|
||||||
|
log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err)
|
||||||
|
err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil)
|
||||||
|
//TODO(jdef) check for temporary or not-found errors
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err)
|
||||||
|
}
|
||||||
|
} else if ok {
|
||||||
|
ks.sched.Tasks().Register(t)
|
||||||
|
recoverSlave(t)
|
||||||
|
log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ks *framework) KillTask(id string) error {
|
||||||
|
killTaskId := mutil.NewTaskID(id)
|
||||||
|
_, err := ks.driver.KillTask(killTaskId)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ks *framework) LaunchTask(t *podtask.T) error {
|
||||||
|
// assume caller is holding scheduler lock
|
||||||
|
taskList := []*mesos.TaskInfo{t.BuildTaskInfo(ks.executor)}
|
||||||
|
offerIds := []*mesos.OfferID{t.Offer.Details().Id}
|
||||||
|
filters := &mesos.Filters{}
|
||||||
|
_, err := ks.driver.LaunchTasks(offerIds, taskList, filters)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ks *framework) Offers() offers.Registry {
|
||||||
|
return ks.offers
|
||||||
|
}
|
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
|
|||||||
limitations under the License.
|
limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package scheduler
|
package framework
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"reflect"
|
"reflect"
|
||||||
@@ -25,9 +25,9 @@ import (
|
|||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
|
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
|
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
|
||||||
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
|
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/slave"
|
|
||||||
"k8s.io/kubernetes/pkg/api"
|
"k8s.io/kubernetes/pkg/api"
|
||||||
"k8s.io/kubernetes/pkg/client/cache"
|
"k8s.io/kubernetes/pkg/client/cache"
|
||||||
)
|
)
|
||||||
@@ -81,12 +81,19 @@ func (r *mockRegistrator) Register(hostName string, labels map[string]string) (b
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func mockScheduler() scheduler.Scheduler {
|
||||||
|
mockScheduler := &scheduler.MockScheduler{}
|
||||||
|
reg := podtask.NewInMemoryRegistry()
|
||||||
|
mockScheduler.On("Tasks").Return(reg)
|
||||||
|
return mockScheduler
|
||||||
|
}
|
||||||
|
|
||||||
//test adding of a resource offer; it should be added to the offer registry and slaves
|
//test adding of a resource offer; it should be added to the offer registry and slaves
|
||||||
func TestResourceOffer_Add(t *testing.T) {
|
func TestResourceOffer_Add(t *testing.T) {
|
||||||
assert := assert.New(t)
|
assert := assert.New(t)
|
||||||
|
|
||||||
registrator := &mockRegistrator{cache.NewStore(cache.MetaNamespaceKeyFunc)}
|
registrator := &mockRegistrator{cache.NewStore(cache.MetaNamespaceKeyFunc)}
|
||||||
testScheduler := &KubernetesScheduler{
|
testFramework := &framework{
|
||||||
offers: offers.CreateRegistry(offers.RegistryConfig{
|
offers: offers.CreateRegistry(offers.RegistryConfig{
|
||||||
Compat: func(o *mesos.Offer) bool {
|
Compat: func(o *mesos.Offer) bool {
|
||||||
return true
|
return true
|
||||||
@@ -99,39 +106,40 @@ func TestResourceOffer_Add(t *testing.T) {
|
|||||||
TTL: schedcfg.DefaultOfferTTL,
|
TTL: schedcfg.DefaultOfferTTL,
|
||||||
ListenerDelay: schedcfg.DefaultListenerDelay,
|
ListenerDelay: schedcfg.DefaultListenerDelay,
|
||||||
}),
|
}),
|
||||||
slaveHostNames: slave.NewRegistry(),
|
slaveHostNames: newSlaveRegistry(),
|
||||||
nodeRegistrator: registrator,
|
nodeRegistrator: registrator,
|
||||||
|
sched: mockScheduler(),
|
||||||
}
|
}
|
||||||
|
|
||||||
hostname := "h1"
|
hostname := "h1"
|
||||||
offerID1 := util.NewOfferID("test1")
|
offerID1 := util.NewOfferID("test1")
|
||||||
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
||||||
offers1 := []*mesos.Offer{offer1}
|
offers1 := []*mesos.Offer{offer1}
|
||||||
testScheduler.ResourceOffers(nil, offers1)
|
testFramework.ResourceOffers(nil, offers1)
|
||||||
assert.Equal(1, len(registrator.store.List()))
|
assert.Equal(1, len(registrator.store.List()))
|
||||||
|
|
||||||
assert.Equal(1, getNumberOffers(testScheduler.offers))
|
assert.Equal(1, getNumberOffers(testFramework.offers))
|
||||||
//check slave hostname
|
//check slave hostname
|
||||||
assert.Equal(1, len(testScheduler.slaveHostNames.SlaveIDs()))
|
assert.Equal(1, len(testFramework.slaveHostNames.SlaveIDs()))
|
||||||
|
|
||||||
//add another offer
|
//add another offer
|
||||||
hostname2 := "h2"
|
hostname2 := "h2"
|
||||||
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
|
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
|
||||||
offers2 := []*mesos.Offer{offer2}
|
offers2 := []*mesos.Offer{offer2}
|
||||||
testScheduler.ResourceOffers(nil, offers2)
|
testFramework.ResourceOffers(nil, offers2)
|
||||||
|
|
||||||
//check it is stored in registry
|
//check it is stored in registry
|
||||||
assert.Equal(2, getNumberOffers(testScheduler.offers))
|
assert.Equal(2, getNumberOffers(testFramework.offers))
|
||||||
|
|
||||||
//check slave hostnames
|
//check slave hostnames
|
||||||
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
|
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
|
||||||
}
|
}
|
||||||
|
|
||||||
//test adding and rescinding of resource offers; they should be added to and removed from the offer registry and slaves
|
//test adding and rescinding of resource offers; they should be added to and removed from the offer registry and slaves
|
||||||
func TestResourceOffer_Add_Rescind(t *testing.T) {
|
func TestResourceOffer_Add_Rescind(t *testing.T) {
|
||||||
assert := assert.New(t)
|
assert := assert.New(t)
|
||||||
|
|
||||||
testScheduler := &KubernetesScheduler{
|
testFramework := &framework{
|
||||||
offers: offers.CreateRegistry(offers.RegistryConfig{
|
offers: offers.CreateRegistry(offers.RegistryConfig{
|
||||||
Compat: func(o *mesos.Offer) bool {
|
Compat: func(o *mesos.Offer) bool {
|
||||||
return true
|
return true
|
||||||
@@ -144,42 +152,43 @@ func TestResourceOffer_Add_Rescind(t *testing.T) {
|
|||||||
TTL: schedcfg.DefaultOfferTTL,
|
TTL: schedcfg.DefaultOfferTTL,
|
||||||
ListenerDelay: schedcfg.DefaultListenerDelay,
|
ListenerDelay: schedcfg.DefaultListenerDelay,
|
||||||
}),
|
}),
|
||||||
slaveHostNames: slave.NewRegistry(),
|
slaveHostNames: newSlaveRegistry(),
|
||||||
|
sched: mockScheduler(),
|
||||||
}
|
}
|
||||||
|
|
||||||
hostname := "h1"
|
hostname := "h1"
|
||||||
offerID1 := util.NewOfferID("test1")
|
offerID1 := util.NewOfferID("test1")
|
||||||
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
offer1 := &mesos.Offer{Id: offerID1, Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
||||||
offers1 := []*mesos.Offer{offer1}
|
offers1 := []*mesos.Offer{offer1}
|
||||||
testScheduler.ResourceOffers(nil, offers1)
|
testFramework.ResourceOffers(nil, offers1)
|
||||||
|
|
||||||
assert.Equal(1, getNumberOffers(testScheduler.offers))
|
assert.Equal(1, getNumberOffers(testFramework.offers))
|
||||||
|
|
||||||
//check slave hostname
|
//check slave hostname
|
||||||
assert.Equal(1, len(testScheduler.slaveHostNames.SlaveIDs()))
|
assert.Equal(1, len(testFramework.slaveHostNames.SlaveIDs()))
|
||||||
|
|
||||||
//add another offer
|
//add another offer
|
||||||
hostname2 := "h2"
|
hostname2 := "h2"
|
||||||
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
|
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
|
||||||
offers2 := []*mesos.Offer{offer2}
|
offers2 := []*mesos.Offer{offer2}
|
||||||
testScheduler.ResourceOffers(nil, offers2)
|
testFramework.ResourceOffers(nil, offers2)
|
||||||
|
|
||||||
assert.Equal(2, getNumberOffers(testScheduler.offers))
|
assert.Equal(2, getNumberOffers(testFramework.offers))
|
||||||
|
|
||||||
//check slave hostnames
|
//check slave hostnames
|
||||||
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
|
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
|
||||||
|
|
||||||
//next whether offers can be rescinded
|
//next whether offers can be rescinded
|
||||||
testScheduler.OfferRescinded(nil, offerID1)
|
testFramework.OfferRescinded(nil, offerID1)
|
||||||
assert.Equal(1, getNumberOffers(testScheduler.offers))
|
assert.Equal(1, getNumberOffers(testFramework.offers))
|
||||||
|
|
||||||
//next whether offers can be rescinded
|
//next whether offers can be rescinded
|
||||||
testScheduler.OfferRescinded(nil, util.NewOfferID("test2"))
|
testFramework.OfferRescinded(nil, util.NewOfferID("test2"))
|
||||||
//walk offers again and check it is removed from registry
|
//walk offers again and check it is removed from registry
|
||||||
assert.Equal(0, getNumberOffers(testScheduler.offers))
|
assert.Equal(0, getNumberOffers(testFramework.offers))
|
||||||
|
|
||||||
//remove non existing ID
|
//remove non existing ID
|
||||||
testScheduler.OfferRescinded(nil, util.NewOfferID("notExist"))
|
testFramework.OfferRescinded(nil, util.NewOfferID("notExist"))
|
||||||
}
|
}
|
||||||
|
|
||||||
//test that when a slave is lost we remove all offers
|
//test that when a slave is lost we remove all offers
|
||||||
@@ -187,7 +196,7 @@ func TestSlave_Lost(t *testing.T) {
|
|||||||
assert := assert.New(t)
|
assert := assert.New(t)
|
||||||
|
|
||||||
//
|
//
|
||||||
testScheduler := &KubernetesScheduler{
|
testFramework := &framework{
|
||||||
offers: offers.CreateRegistry(offers.RegistryConfig{
|
offers: offers.CreateRegistry(offers.RegistryConfig{
|
||||||
Compat: func(o *mesos.Offer) bool {
|
Compat: func(o *mesos.Offer) bool {
|
||||||
return true
|
return true
|
||||||
@@ -197,45 +206,46 @@ func TestSlave_Lost(t *testing.T) {
|
|||||||
TTL: schedcfg.DefaultOfferTTL,
|
TTL: schedcfg.DefaultOfferTTL,
|
||||||
ListenerDelay: schedcfg.DefaultListenerDelay,
|
ListenerDelay: schedcfg.DefaultListenerDelay,
|
||||||
}),
|
}),
|
||||||
slaveHostNames: slave.NewRegistry(),
|
slaveHostNames: newSlaveRegistry(),
|
||||||
|
sched: mockScheduler(),
|
||||||
}
|
}
|
||||||
|
|
||||||
hostname := "h1"
|
hostname := "h1"
|
||||||
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
||||||
offers1 := []*mesos.Offer{offer1}
|
offers1 := []*mesos.Offer{offer1}
|
||||||
testScheduler.ResourceOffers(nil, offers1)
|
testFramework.ResourceOffers(nil, offers1)
|
||||||
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
||||||
offers2 := []*mesos.Offer{offer2}
|
offers2 := []*mesos.Offer{offer2}
|
||||||
testScheduler.ResourceOffers(nil, offers2)
|
testFramework.ResourceOffers(nil, offers2)
|
||||||
|
|
||||||
//add another offer from different slaveID
|
//add another offer from different slaveID
|
||||||
hostname2 := "h2"
|
hostname2 := "h2"
|
||||||
offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
|
offer3 := &mesos.Offer{Id: util.NewOfferID("test3"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
|
||||||
offers3 := []*mesos.Offer{offer3}
|
offers3 := []*mesos.Offer{offer3}
|
||||||
testScheduler.ResourceOffers(nil, offers3)
|
testFramework.ResourceOffers(nil, offers3)
|
||||||
|
|
||||||
//test precondition
|
//test precondition
|
||||||
assert.Equal(3, getNumberOffers(testScheduler.offers))
|
assert.Equal(3, getNumberOffers(testFramework.offers))
|
||||||
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
|
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
|
||||||
|
|
||||||
//remove first slave
|
//remove first slave
|
||||||
testScheduler.SlaveLost(nil, util.NewSlaveID(hostname))
|
testFramework.SlaveLost(nil, util.NewSlaveID(hostname))
|
||||||
|
|
||||||
//offers should be removed
|
//offers should be removed
|
||||||
assert.Equal(1, getNumberOffers(testScheduler.offers))
|
assert.Equal(1, getNumberOffers(testFramework.offers))
|
||||||
//slave hostnames should still be all present
|
//slave hostnames should still be all present
|
||||||
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
|
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
|
||||||
|
|
||||||
//remove second slave
|
//remove second slave
|
||||||
testScheduler.SlaveLost(nil, util.NewSlaveID(hostname2))
|
testFramework.SlaveLost(nil, util.NewSlaveID(hostname2))
|
||||||
|
|
||||||
//offers should be removed
|
//offers should be removed
|
||||||
assert.Equal(0, getNumberOffers(testScheduler.offers))
|
assert.Equal(0, getNumberOffers(testFramework.offers))
|
||||||
//slave hostnames should still be all present
|
//slave hostnames should still be all present
|
||||||
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
|
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
|
||||||
|
|
||||||
//try to remove non existing slave
|
//try to remove non existing slave
|
||||||
testScheduler.SlaveLost(nil, util.NewSlaveID("notExist"))
|
testFramework.SlaveLost(nil, util.NewSlaveID("notExist"))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -244,7 +254,7 @@ func TestDisconnect(t *testing.T) {
|
|||||||
assert := assert.New(t)
|
assert := assert.New(t)
|
||||||
|
|
||||||
//
|
//
|
||||||
testScheduler := &KubernetesScheduler{
|
testFramework := &framework{
|
||||||
offers: offers.CreateRegistry(offers.RegistryConfig{
|
offers: offers.CreateRegistry(offers.RegistryConfig{
|
||||||
Compat: func(o *mesos.Offer) bool {
|
Compat: func(o *mesos.Offer) bool {
|
||||||
return true
|
return true
|
||||||
@@ -254,30 +264,31 @@ func TestDisconnect(t *testing.T) {
|
|||||||
TTL: schedcfg.DefaultOfferTTL,
|
TTL: schedcfg.DefaultOfferTTL,
|
||||||
ListenerDelay: schedcfg.DefaultListenerDelay,
|
ListenerDelay: schedcfg.DefaultListenerDelay,
|
||||||
}),
|
}),
|
||||||
slaveHostNames: slave.NewRegistry(),
|
slaveHostNames: newSlaveRegistry(),
|
||||||
|
sched: mockScheduler(),
|
||||||
}
|
}
|
||||||
|
|
||||||
hostname := "h1"
|
hostname := "h1"
|
||||||
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
offer1 := &mesos.Offer{Id: util.NewOfferID("test1"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
||||||
offers1 := []*mesos.Offer{offer1}
|
offers1 := []*mesos.Offer{offer1}
|
||||||
testScheduler.ResourceOffers(nil, offers1)
|
testFramework.ResourceOffers(nil, offers1)
|
||||||
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
offer2 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname, SlaveId: util.NewSlaveID(hostname)}
|
||||||
offers2 := []*mesos.Offer{offer2}
|
offers2 := []*mesos.Offer{offer2}
|
||||||
testScheduler.ResourceOffers(nil, offers2)
|
testFramework.ResourceOffers(nil, offers2)
|
||||||
|
|
||||||
//add another offer from different slaveID
|
//add another offer from different slaveID
|
||||||
hostname2 := "h2"
|
hostname2 := "h2"
|
||||||
offer3 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
|
offer3 := &mesos.Offer{Id: util.NewOfferID("test2"), Hostname: &hostname2, SlaveId: util.NewSlaveID(hostname2)}
|
||||||
offers3 := []*mesos.Offer{offer3}
|
offers3 := []*mesos.Offer{offer3}
|
||||||
testScheduler.ResourceOffers(nil, offers3)
|
testFramework.ResourceOffers(nil, offers3)
|
||||||
|
|
||||||
//disconnect
|
//disconnect
|
||||||
testScheduler.Disconnected(nil)
|
testFramework.Disconnected(nil)
|
||||||
|
|
||||||
//all offers should be removed
|
//all offers should be removed
|
||||||
assert.Equal(0, getNumberOffers(testScheduler.offers))
|
assert.Equal(0, getNumberOffers(testFramework.offers))
|
||||||
//slave hostnames should still be all present
|
//slave hostnames should still be all present
|
||||||
assert.Equal(2, len(testScheduler.slaveHostNames.SlaveIDs()))
|
assert.Equal(2, len(testFramework.slaveHostNames.SlaveIDs()))
|
||||||
}
|
}
|
||||||
|
|
||||||
//test we can handle different status updates, TODO check state transitions
|
//test we can handle different status updates, TODO check state transitions
|
||||||
@@ -287,7 +298,7 @@ func TestStatus_Update(t *testing.T) {
|
|||||||
// setup expectations
|
// setup expectations
|
||||||
mockdriver.On("KillTask", util.NewTaskID("test-task-001")).Return(mesos.Status_DRIVER_RUNNING, nil)
|
mockdriver.On("KillTask", util.NewTaskID("test-task-001")).Return(mesos.Status_DRIVER_RUNNING, nil)
|
||||||
|
|
||||||
testScheduler := &KubernetesScheduler{
|
testFramework := &framework{
|
||||||
offers: offers.CreateRegistry(offers.RegistryConfig{
|
offers: offers.CreateRegistry(offers.RegistryConfig{
|
||||||
Compat: func(o *mesos.Offer) bool {
|
Compat: func(o *mesos.Offer) bool {
|
||||||
return true
|
return true
|
||||||
@@ -297,28 +308,28 @@ func TestStatus_Update(t *testing.T) {
|
|||||||
TTL: schedcfg.DefaultOfferTTL,
|
TTL: schedcfg.DefaultOfferTTL,
|
||||||
ListenerDelay: schedcfg.DefaultListenerDelay,
|
ListenerDelay: schedcfg.DefaultListenerDelay,
|
||||||
}),
|
}),
|
||||||
slaveHostNames: slave.NewRegistry(),
|
slaveHostNames: newSlaveRegistry(),
|
||||||
driver: &mockdriver,
|
driver: &mockdriver,
|
||||||
taskRegistry: podtask.NewInMemoryRegistry(),
|
sched: mockScheduler(),
|
||||||
}
|
}
|
||||||
|
|
||||||
taskStatus_task_starting := util.NewTaskStatus(
|
taskStatus_task_starting := util.NewTaskStatus(
|
||||||
util.NewTaskID("test-task-001"),
|
util.NewTaskID("test-task-001"),
|
||||||
mesos.TaskState_TASK_RUNNING,
|
mesos.TaskState_TASK_RUNNING,
|
||||||
)
|
)
|
||||||
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_starting)
|
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_starting)
|
||||||
|
|
||||||
taskStatus_task_running := util.NewTaskStatus(
|
taskStatus_task_running := util.NewTaskStatus(
|
||||||
util.NewTaskID("test-task-001"),
|
util.NewTaskID("test-task-001"),
|
||||||
mesos.TaskState_TASK_RUNNING,
|
mesos.TaskState_TASK_RUNNING,
|
||||||
)
|
)
|
||||||
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_running)
|
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_running)
|
||||||
|
|
||||||
taskStatus_task_failed := util.NewTaskStatus(
|
taskStatus_task_failed := util.NewTaskStatus(
|
||||||
util.NewTaskID("test-task-001"),
|
util.NewTaskID("test-task-001"),
|
||||||
mesos.TaskState_TASK_FAILED,
|
mesos.TaskState_TASK_FAILED,
|
||||||
)
|
)
|
||||||
testScheduler.StatusUpdate(testScheduler.driver, taskStatus_task_failed)
|
testFramework.StatusUpdate(testFramework.driver, taskStatus_task_failed)
|
||||||
|
|
||||||
//assert that mock was invoked
|
//assert that mock was invoked
|
||||||
mockdriver.AssertExpectations(t)
|
mockdriver.AssertExpectations(t)
|
@@ -14,25 +14,26 @@ See the License for the specific language governing permissions and
|
|||||||
limitations under the License.
|
limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package slave
|
package framework
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"sync"
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Registry struct {
|
// slaveRegistry manages node hostnames for slave ids.
|
||||||
|
type slaveRegistry struct {
|
||||||
lock sync.Mutex
|
lock sync.Mutex
|
||||||
hostNames map[string]string
|
hostNames map[string]string
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewRegistry() *Registry {
|
func newSlaveRegistry() *slaveRegistry {
|
||||||
return &Registry{
|
return &slaveRegistry{
|
||||||
hostNames: map[string]string{},
|
hostNames: map[string]string{},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Register creates a mapping from slaveId to slave hostname, if one does not already exist.
|
// Register creates a mapping from slaveId to slave hostname, if one does not already exist.
|
||||||
func (st *Registry) Register(slaveId, slaveHostname string) {
|
func (st *slaveRegistry) Register(slaveId, slaveHostname string) {
|
||||||
st.lock.Lock()
|
st.lock.Lock()
|
||||||
defer st.lock.Unlock()
|
defer st.lock.Unlock()
|
||||||
_, exists := st.hostNames[slaveId]
|
_, exists := st.hostNames[slaveId]
|
||||||
@@ -42,7 +43,7 @@ func (st *Registry) Register(slaveId, slaveHostname string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// SlaveIDs returns the keys of the registry
|
// SlaveIDs returns the keys of the registry
|
||||||
func (st *Registry) SlaveIDs() []string {
|
func (st *slaveRegistry) SlaveIDs() []string {
|
||||||
st.lock.Lock()
|
st.lock.Lock()
|
||||||
defer st.lock.Unlock()
|
defer st.lock.Unlock()
|
||||||
slaveIds := make([]string, 0, len(st.hostNames))
|
slaveIds := make([]string, 0, len(st.hostNames))
|
||||||
@@ -53,7 +54,7 @@ func (st *Registry) SlaveIDs() []string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// HostName looks up a hostname for a given slaveId
|
// HostName looks up a hostname for a given slaveId
|
||||||
func (st *Registry) HostName(slaveId string) string {
|
func (st *slaveRegistry) HostName(slaveId string) string {
|
||||||
st.lock.Lock()
|
st.lock.Lock()
|
||||||
defer st.lock.Unlock()
|
defer st.lock.Unlock()
|
||||||
return st.hostNames[slaveId]
|
return st.hostNames[slaveId]
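A minimal, in-package usage sketch of the renamed slaveRegistry; the slave IDs and hostnames below are made up for illustration and are not part of this change:

package framework

import "fmt"

// sketch only: exercise the registry the way the framework does on offer receipt.
func exampleSlaveRegistry() {
	reg := newSlaveRegistry()
	reg.Register("20151105-slave-1", "node-a.example.com") // hypothetical IDs/hostnames
	reg.Register("20151105-slave-2", "node-b.example.com")

	fmt.Println(reg.HostName("20151105-slave-1")) // node-a.example.com
	fmt.Println(len(reg.SlaveIDs()))              // 2
}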
|
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
|
|||||||
limitations under the License.
|
limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package slave
|
package framework
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"testing"
|
"testing"
|
||||||
@@ -26,7 +26,7 @@ import (
|
|||||||
func TestSlaveStorage_Register(t *testing.T) {
|
func TestSlaveStorage_Register(t *testing.T) {
|
||||||
assert := assert.New(t)
|
assert := assert.New(t)
|
||||||
|
|
||||||
slaveStorage := NewRegistry()
|
slaveStorage := newSlaveRegistry()
|
||||||
assert.Equal(0, len(slaveStorage.hostNames))
|
assert.Equal(0, len(slaveStorage.hostNames))
|
||||||
|
|
||||||
slaveId := "slave1"
|
slaveId := "slave1"
|
||||||
@@ -42,7 +42,7 @@ func TestSlaveStorage_Register(t *testing.T) {
|
|||||||
func TestSlaveStorage_HostName(t *testing.T) {
|
func TestSlaveStorage_HostName(t *testing.T) {
|
||||||
assert := assert.New(t)
|
assert := assert.New(t)
|
||||||
|
|
||||||
slaveStorage := NewRegistry()
|
slaveStorage := newSlaveRegistry()
|
||||||
assert.Equal(0, len(slaveStorage.hostNames))
|
assert.Equal(0, len(slaveStorage.hostNames))
|
||||||
|
|
||||||
slaveId := "slave1"
|
slaveId := "slave1"
|
||||||
@@ -62,7 +62,7 @@ func TestSlaveStorage_HostName(t *testing.T) {
|
|||||||
func TestSlaveStorage_SlaveIds(t *testing.T) {
|
func TestSlaveStorage_SlaveIds(t *testing.T) {
|
||||||
assert := assert.New(t)
|
assert := assert.New(t)
|
||||||
|
|
||||||
slaveStorage := NewRegistry()
|
slaveStorage := newSlaveRegistry()
|
||||||
assert.Equal(0, len(slaveStorage.hostNames))
|
assert.Equal(0, len(slaveStorage.hostNames))
|
||||||
|
|
||||||
slaveId := "1"
|
slaveId := "1"
|
19
contrib/mesos/pkg/scheduler/components/podreconciler/doc.go
Normal file
19
contrib/mesos/pkg/scheduler/components/podreconciler/doc.go
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Package podreconciler implements reconciliation of pods that failed
|
||||||
|
// to launch, i.e. before binding by the executor took place.
|
||||||
|
package podreconciler
|
@@ -0,0 +1,120 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package podreconciler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
|
||||||
|
log "github.com/golang/glog"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/deleter"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
|
||||||
|
"k8s.io/kubernetes/pkg/api"
|
||||||
|
apierrors "k8s.io/kubernetes/pkg/api/errors"
|
||||||
|
client "k8s.io/kubernetes/pkg/client/unversioned"
|
||||||
|
)
|
||||||
|
|
||||||
|
// PodReconciler reconciles a pod with the apiserver
|
||||||
|
type PodReconciler interface {
|
||||||
|
Reconcile(t *podtask.T)
|
||||||
|
}
|
||||||
|
|
||||||
|
type podReconciler struct {
|
||||||
|
sched scheduler.Scheduler
|
||||||
|
client *client.Client
|
||||||
|
qr queuer.Queuer
|
||||||
|
deleter deleter.Deleter
|
||||||
|
}
|
||||||
|
|
||||||
|
func New(sched scheduler.Scheduler, client *client.Client, qr queuer.Queuer, deleter deleter.Deleter) PodReconciler {
|
||||||
|
return &podReconciler{
|
||||||
|
sched: sched,
|
||||||
|
client: client,
|
||||||
|
qr: qr,
|
||||||
|
deleter: deleter,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// this pod may be out of sync with respect to the API server registry:
|
||||||
|
// this pod | apiserver registry
|
||||||
|
// -------------|----------------------
|
||||||
|
// host=.* | 404 ; pod was deleted
|
||||||
|
// host=.* | 5xx ; failed to sync, try again later?
|
||||||
|
// host="" | host="" ; perhaps no updates to process?
|
||||||
|
// host="" | host="..." ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
|
||||||
|
// host="..." | host="" ; pod is no longer scheduled, does it need to be re-queued?
|
||||||
|
// host="..." | host="..." ; perhaps no updates to process?
|
||||||
|
//
|
||||||
|
// TODO(jdef) this needs an integration test
|
||||||
|
func (s *podReconciler) Reconcile(t *podtask.T) {
|
||||||
|
log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
|
||||||
|
ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
|
||||||
|
pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
|
||||||
|
if err != nil {
|
||||||
|
if apierrors.IsNotFound(err) {
|
||||||
|
// attempt to delete
|
||||||
|
if err = s.deleter.DeleteOne(&queuer.Pod{Pod: &t.Pod}); err != nil && err != errors.NoSuchPodErr && err != errors.NoSuchTaskErr {
|
||||||
|
log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
|
||||||
|
//For now, drop the pod on the floor
|
||||||
|
log.Warningf("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
|
||||||
|
if t.Spec.AssignedSlave != pod.Spec.NodeName {
|
||||||
|
if pod.Spec.NodeName == "" {
|
||||||
|
// pod is unscheduled.
|
||||||
|
// it's possible that we dropped the pod in the scheduler error handler
|
||||||
|
// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
|
||||||
|
|
||||||
|
podKey, err := podtask.MakePodKey(ctx, pod.Name)
|
||||||
|
if err != nil {
|
||||||
|
log.Error(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
s.sched.Lock()
|
||||||
|
defer s.sched.Unlock()
|
||||||
|
|
||||||
|
if _, state := s.sched.Tasks().ForPod(podKey); state != podtask.StateUnknown {
|
||||||
|
//TODO(jdef) reconcile the task
|
||||||
|
log.Errorf("task already registered for pod %v", pod.Name)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
log.V(3).Infof("reoffering pod %v", podKey)
|
||||||
|
s.qr.Reoffer(queuer.NewPodWithDeadline(pod, &now))
|
||||||
|
} else {
|
||||||
|
// pod is scheduled.
|
||||||
|
// not sure how this happened behind our backs. attempt to reconstruct
|
||||||
|
// at least a partial podtask.T record.
|
||||||
|
//TODO(jdef) reconcile the task
|
||||||
|
log.Errorf("pod already scheduled: %v", pod.Name)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
|
||||||
|
//and assume that our knowledge of the pod aligns with that of the apiserver
|
||||||
|
log.Error("pod reconciliation does not support updates; not yet implemented")
|
||||||
|
}
|
||||||
|
}
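The decision table above boils down to a small host comparison; the sketch below restates it as a standalone function. The names, the reconcileAction type, and the omission of the non-404 error path are illustrative assumptions, not part of this change:

package main

import "fmt"

type reconcileAction string

const (
	actionDelete   reconcileAction = "delete stale pod/task"
	actionRequeue  reconcileAction = "re-queue pod for scheduling"
	actionConflict reconcileAction = "task/pod mismatch; needs task reconciliation"
	actionNoop     reconcileAction = "no update to process"
)

// decide mirrors the host comparison in podReconciler.Reconcile: taskHost is
// t.Spec.AssignedSlave, apiHost is pod.Spec.NodeName as reported by the
// apiserver, and notFound stands for a 404 on the pod lookup.
func decide(taskHost, apiHost string, notFound bool) reconcileAction {
	switch {
	case notFound:
		return actionDelete
	case taskHost == apiHost:
		return actionNoop
	case apiHost == "":
		return actionRequeue
	default:
		return actionConflict
	}
}

func main() {
	fmt.Println(decide("slave-1", "", false)) // re-queue pod for scheduling
	fmt.Println(decide("", "node-2", false))  // task/pod mismatch; needs task reconciliation
	fmt.Println(decide("", "", true))         // delete stale pod/task
}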
|
63
contrib/mesos/pkg/scheduler/components/podstoreadapter.go
Normal file
63
contrib/mesos/pkg/scheduler/components/podstoreadapter.go
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package components
|
||||||
|
|
||||||
|
import (
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
|
||||||
|
"k8s.io/kubernetes/pkg/api"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Consumes *api.Pod, produces *queuer.Pod; the k8s reflector wants to push *api.Pod
|
||||||
|
// objects at us, but we want to store the more flexible queuer.Pod type defined
|
||||||
|
// in the queuer package. The adapter implementation facilitates this. It's a little
|
||||||
|
// hackish since the object type going in is different than the object type
|
||||||
|
// coming out -- you've been warned.
|
||||||
|
type podStoreAdapter struct {
|
||||||
|
queue.FIFO
|
||||||
|
}
|
||||||
|
|
||||||
|
func (psa *podStoreAdapter) Add(obj interface{}) error {
|
||||||
|
pod := obj.(*api.Pod)
|
||||||
|
return psa.FIFO.Add(&queuer.Pod{Pod: pod})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (psa *podStoreAdapter) Update(obj interface{}) error {
|
||||||
|
pod := obj.(*api.Pod)
|
||||||
|
return psa.FIFO.Update(&queuer.Pod{Pod: pod})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (psa *podStoreAdapter) Delete(obj interface{}) error {
|
||||||
|
pod := obj.(*api.Pod)
|
||||||
|
return psa.FIFO.Delete(&queuer.Pod{Pod: pod})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
|
||||||
|
pod := obj.(*api.Pod)
|
||||||
|
return psa.FIFO.Get(&queuer.Pod{Pod: pod})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace will delete the contents of the store, using instead the
|
||||||
|
// given list. This store implementation does NOT take ownership of the list.
|
||||||
|
func (psa *podStoreAdapter) Replace(objs []interface{}, resourceVersion string) error {
|
||||||
|
newobjs := make([]interface{}, len(objs))
|
||||||
|
for i, v := range objs {
|
||||||
|
pod := v.(*api.Pod)
|
||||||
|
newobjs[i] = &queuer.Pod{Pod: pod}
|
||||||
|
}
|
||||||
|
return psa.FIFO.Replace(newobjs, resourceVersion)
|
||||||
|
}
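A self-contained sketch of the same adapter idea using stand-in types; apiPod, queuedPod, and fifo are toy substitutes for the real api/queuer/queue types, used only to show the convert-then-delegate shape:

package main

import "fmt"

// stand-ins for api.Pod and queuer.Pod; the real types live in the k8s packages
type apiPod struct{ Name string }
type queuedPod struct{ *apiPod }

// stand-in for the queue.FIFO Add behaviour
type fifo struct{ items []interface{} }

func (f *fifo) Add(obj interface{}) error {
	f.items = append(f.items, obj)
	return nil
}

// same shape as podStoreAdapter.Add: assert the incoming type, wrap, delegate
type podAdapter struct{ *fifo }

func (a *podAdapter) Add(obj interface{}) error {
	pod := obj.(*apiPod) // reflector-style callers always hand us *apiPod
	return a.fifo.Add(&queuedPod{apiPod: pod})
}

func main() {
	a := &podAdapter{&fifo{}}
	_ = a.Add(&apiPod{Name: "web-1"})
	fmt.Printf("stored as %T\n", a.items[0]) // stored as *main.queuedPod
}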
|
137
contrib/mesos/pkg/scheduler/components/scheduler.go
Normal file
137
contrib/mesos/pkg/scheduler/components/scheduler.go
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package components
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/binder"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/controller"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/deleter"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/errorhandler"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/podreconciler"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
|
||||||
|
"k8s.io/kubernetes/pkg/api"
|
||||||
|
"k8s.io/kubernetes/pkg/client/cache"
|
||||||
|
"k8s.io/kubernetes/pkg/client/record"
|
||||||
|
client "k8s.io/kubernetes/pkg/client/unversioned"
|
||||||
|
)
|
||||||
|
|
||||||
|
// sched implements the Scheduler interface.
|
||||||
|
type sched struct {
|
||||||
|
podReconciler podreconciler.PodReconciler
|
||||||
|
framework framework.Framework
|
||||||
|
controller controller.Controller
|
||||||
|
|
||||||
|
// unsafe state, needs to be guarded, especially changes to podtask.T objects
|
||||||
|
sync.RWMutex
|
||||||
|
taskRegistry podtask.Registry
|
||||||
|
}
|
||||||
|
|
||||||
|
func New(c *config.Config, fw framework.Framework, ps podschedulers.PodScheduler,
|
||||||
|
client *client.Client, recorder record.EventRecorder, terminate <-chan struct{}, mux *http.ServeMux, lw *cache.ListWatch) scheduler.Scheduler {
|
||||||
|
|
||||||
|
core := &sched{
|
||||||
|
framework: fw,
|
||||||
|
taskRegistry: podtask.NewInMemoryRegistry(),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Watch and queue pods that need scheduling.
|
||||||
|
podUpdatesBypass := make(chan queue.Entry, c.UpdatesBacklog)
|
||||||
|
podUpdates := &podStoreAdapter{queue.NewHistorical(podUpdatesBypass)}
|
||||||
|
reflector := cache.NewReflector(lw, &api.Pod{}, podUpdates, 0)
|
||||||
|
|
||||||
|
q := queuer.New(queue.NewDelayFIFO(), podUpdates)
|
||||||
|
|
||||||
|
algorithm := algorithm.New(core, podUpdates, ps)
|
||||||
|
|
||||||
|
podDeleter := deleter.New(core, q)
|
||||||
|
|
||||||
|
core.podReconciler = podreconciler.New(core, client, q, podDeleter)
|
||||||
|
|
||||||
|
bo := backoff.New(c.InitialPodBackoff.Duration, c.MaxPodBackoff.Duration)
|
||||||
|
newBC := func(podKey string) queue.BreakChan {
|
||||||
|
return queue.BreakChan(core.Offers().Listen(podKey, func(offer *mesos.Offer) bool {
|
||||||
|
core.Lock()
|
||||||
|
defer core.Unlock()
|
||||||
|
switch task, state := core.Tasks().ForPod(podKey); state {
|
||||||
|
case podtask.StatePending:
|
||||||
|
// Assess fitness of pod with the current offer. The scheduler normally
|
||||||
|
// "backs off" when it can't find an offer that matches up with a pod.
|
||||||
|
// The backoff period for a pod can terminate sooner if an offer becomes
|
||||||
|
// available that matches up.
|
||||||
|
return !task.Has(podtask.Launched) && ps.FitPredicate()(task, offer, nil)
|
||||||
|
default:
|
||||||
|
// no point in continuing to check for matching offers
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
errorHandler := errorhandler.New(core, bo, q, newBC)
|
||||||
|
|
||||||
|
binder := binder.New(core)
|
||||||
|
|
||||||
|
startLatch := make(chan struct{})
|
||||||
|
|
||||||
|
runtime.On(startLatch, func() {
|
||||||
|
reflector.Run() // TODO(jdef) should listen for termination
|
||||||
|
podDeleter.Run(podUpdatesBypass, terminate)
|
||||||
|
q.Run(terminate)
|
||||||
|
|
||||||
|
q.InstallDebugHandlers(mux)
|
||||||
|
podtask.InstallDebugHandlers(core.Tasks(), mux)
|
||||||
|
})
|
||||||
|
|
||||||
|
core.controller = controller.New(client, algorithm, recorder, q.Yield, errorHandler.Error, binder, startLatch)
|
||||||
|
return core
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *sched) Run(done <-chan struct{}) {
|
||||||
|
c.controller.Run(done)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *sched) Reconcile(t *podtask.T) {
|
||||||
|
c.podReconciler.Reconcile(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *sched) Tasks() podtask.Registry {
|
||||||
|
return c.taskRegistry
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *sched) Offers() offers.Registry {
|
||||||
|
return c.framework.Offers()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *sched) KillTask(id string) error {
|
||||||
|
return c.framework.KillTask(id)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *sched) LaunchTask(t *podtask.T) error {
|
||||||
|
return c.framework.LaunchTask(t)
|
||||||
|
}
|
@@ -0,0 +1,18 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Package taskreconciler implements Mesos task reconciliation.
|
||||||
|
package taskreconciler
|
@@ -0,0 +1,235 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package taskreconciler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
log "github.com/golang/glog"
|
||||||
|
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||||
|
bindings "github.com/mesos/mesos-go/scheduler"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/proc"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Action func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error
|
||||||
|
|
||||||
|
type TasksReconciler interface {
|
||||||
|
RequestExplicit()
|
||||||
|
RequestImplicit()
|
||||||
|
Run(driver bindings.SchedulerDriver, done <-chan struct{})
|
||||||
|
}
|
||||||
|
|
||||||
|
type tasksReconciler struct {
|
||||||
|
proc.Doer
|
||||||
|
Action Action
|
||||||
|
explicit chan struct{} // send an empty struct to trigger explicit reconciliation
|
||||||
|
implicit chan struct{} // send an empty struct to trigger implicit reconciliation
|
||||||
|
cooldown time.Duration
|
||||||
|
explicitReconciliationAbortTimeout time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
|
func New(doer proc.Doer, action Action,
|
||||||
|
cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) TasksReconciler {
|
||||||
|
return &tasksReconciler{
|
||||||
|
Doer: doer,
|
||||||
|
explicit: make(chan struct{}, 1),
|
||||||
|
implicit: make(chan struct{}, 1),
|
||||||
|
cooldown: cooldown,
|
||||||
|
explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
|
||||||
|
Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
|
||||||
|
// trigger the reconciler action in the doer's execution context,
|
||||||
|
// but it could take a while and the scheduler needs to be able to
|
||||||
|
// process updates, the callbacks for which ALSO execute in the SAME
|
||||||
|
// deferred execution context -- so the action MUST be executed async.
|
||||||
|
errOnce := proc.NewErrorOnce(cancel)
|
||||||
|
return errOnce.Send(doer.Do(func() {
|
||||||
|
// only triggers the action if we're the currently elected,
|
||||||
|
// registered master and runs the action async.
|
||||||
|
go func() {
|
||||||
|
var err <-chan error
|
||||||
|
defer errOnce.Send(err)
|
||||||
|
err = action(driver, cancel)
|
||||||
|
}()
|
||||||
|
})).Err()
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *tasksReconciler) RequestExplicit() {
|
||||||
|
select {
|
||||||
|
case r.explicit <- struct{}{}: // noop
|
||||||
|
default: // request queue full; noop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *tasksReconciler) RequestImplicit() {
|
||||||
|
select {
|
||||||
|
case r.implicit <- struct{}{}: // noop
|
||||||
|
default: // request queue full; noop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// execute task reconciliation, returns when r.done is closed. intended to run as a goroutine.
|
||||||
|
// if reconciliation is requested while another is in progress, the in-progress operation will be
|
||||||
|
// cancelled before the new reconciliation operation begins.
|
||||||
|
func (r *tasksReconciler) Run(driver bindings.SchedulerDriver, done <-chan struct{}) {
|
||||||
|
var cancel, finished chan struct{}
|
||||||
|
requestLoop:
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
return
|
||||||
|
default: // proceed
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-r.implicit:
|
||||||
|
metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
return
|
||||||
|
case <-r.explicit:
|
||||||
|
break // give preference to a pending request for explicit
|
||||||
|
default: // continue
|
||||||
|
// don't run implicit reconciliation while explicit is ongoing
|
||||||
|
if finished != nil {
|
||||||
|
select {
|
||||||
|
case <-finished: // continue w/ implicit
|
||||||
|
default:
|
||||||
|
log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
|
||||||
|
continue requestLoop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
errOnce := proc.NewErrorOnce(done)
|
||||||
|
errCh := r.Do(func() {
|
||||||
|
var err error
|
||||||
|
defer errOnce.Report(err)
|
||||||
|
log.Infoln("implicit reconcile tasks")
|
||||||
|
metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
|
||||||
|
if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
|
||||||
|
log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
|
||||||
|
log.Errorf("failed to run implicit reconciliation: %v", err)
|
||||||
|
}, done)
|
||||||
|
goto slowdown
|
||||||
|
}
|
||||||
|
case <-done:
|
||||||
|
return
|
||||||
|
case <-r.explicit: // continue
|
||||||
|
metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
|
||||||
|
}
|
||||||
|
|
||||||
|
if cancel != nil {
|
||||||
|
close(cancel)
|
||||||
|
cancel = nil
|
||||||
|
|
||||||
|
// play nice and wait for the prior operation to finish, complain
|
||||||
|
// if it doesn't
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
return
|
||||||
|
case <-finished: // noop, expected
|
||||||
|
case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
|
||||||
|
log.Error("reconciler action failed to stop upon cancellation")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
|
||||||
|
// if cancellation takes too long or fails - we don't want to close the same chan
|
||||||
|
// more than once
|
||||||
|
cancel = make(chan struct{})
|
||||||
|
finished = make(chan struct{})
|
||||||
|
go func(fin chan struct{}) {
|
||||||
|
startedAt := time.Now()
|
||||||
|
defer func() {
|
||||||
|
metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
|
||||||
|
}()
|
||||||
|
|
||||||
|
metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
|
||||||
|
defer close(fin)
|
||||||
|
err := <-r.Action(driver, cancel)
|
||||||
|
if err == errors.ReconciliationCancelledErr {
|
||||||
|
metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
|
||||||
|
log.Infoln(err.Error())
|
||||||
|
} else if err != nil {
|
||||||
|
log.Errorf("reconciler action failed: %v", err)
|
||||||
|
}
|
||||||
|
}(finished)
|
||||||
|
slowdown:
|
||||||
|
// don't allow reconciliation to run very frequently, either explicit or implicit
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
return
|
||||||
|
case <-time.After(r.cooldown): // noop
|
||||||
|
}
|
||||||
|
} // for
|
||||||
|
}
|
||||||
|
|
||||||
|
// MakeComposite invokes the given Action funcs in sequence, aborting the sequence if reconciliation
|
||||||
|
// is cancelled. If any other errors occur the composite reconciler will attempt to complete the
|
||||||
|
// sequence, reporting only the last generated error.
|
||||||
|
func MakeComposite(done <-chan struct{}, actions ...Action) Action {
|
||||||
|
if x := len(actions); x == 0 {
|
||||||
|
// programming error
|
||||||
|
panic("no actions specified for composite reconciler")
|
||||||
|
} else if x == 1 {
|
||||||
|
return actions[0]
|
||||||
|
}
|
||||||
|
chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b Action) <-chan error {
|
||||||
|
ech := a(d, c)
|
||||||
|
ch := make(chan error, 1)
|
||||||
|
go func() {
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-c:
|
||||||
|
case e := <-ech:
|
||||||
|
if e != nil {
|
||||||
|
ch <- e
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ech = b(d, c)
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-c:
|
||||||
|
case e := <-ech:
|
||||||
|
if e != nil {
|
||||||
|
ch <- e
|
||||||
|
return
|
||||||
|
}
|
||||||
|
close(ch)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ch <- fmt.Errorf("aborting composite reconciler action")
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
|
||||||
|
return chained(d, c, actions[0], actions[1])
|
||||||
|
}
|
||||||
|
for i := 2; i < len(actions); i++ {
|
||||||
|
i := i
|
||||||
|
next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
|
||||||
|
return chained(d, c, Action(result), actions[i])
|
||||||
|
}
|
||||||
|
result = next
|
||||||
|
}
|
||||||
|
return Action(result)
|
||||||
|
}
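A hedged, in-package sketch of how a composite Action might be assembled; the "audit" step and the exampleComposite helper are illustrative only, and the result would be handed to New(...) as the reconciler's Action:

package taskreconciler

import (
	mesos "github.com/mesos/mesos-go/mesosproto"
	bindings "github.com/mesos/mesos-go/scheduler"
)

// exampleComposite chains an illustrative no-op audit step with an explicit
// ReconcileTasks call; MakeComposite runs them in order and aborts on cancel.
func exampleComposite(done <-chan struct{}) Action {
	audit := Action(func(d bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
		ch := make(chan error, 1)
		close(ch) // nothing to audit in this sketch; report success immediately
		return ch
	})
	explicit := Action(func(d bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
		ch := make(chan error, 1)
		if _, err := d.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
			ch <- err
		}
		close(ch)
		return ch
	})
	return MakeComposite(done, audit, explicit)
}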
|
@@ -16,3 +16,58 @@ limitations under the License.
|
|||||||
|
|
||||||
// Package scheduler implements the Kubernetes Mesos scheduler.
|
// Package scheduler implements the Kubernetes Mesos scheduler.
|
||||||
package scheduler
|
package scheduler
|
||||||
|
|
||||||
|
// Created from contrib/mesos/docs/scheduler.monopic:
|
||||||
|
//
|
||||||
|
// ┌───────────────────────────────────────────────────────────────────────┐
|
||||||
|
// │ ┌───────────────────────────────────────┐ ┌─┴──────────────────────┐ ┌───────────────┐
|
||||||
|
// ┌────────▼─────────┐ │Queuer │ Await() │ podUpdates │ │ │
|
||||||
|
// │ podUpdatesBypass │ │- Yield() *api.Pod ├──pod CRUD ─▶ (queue.HistoricalFIFO) ◀──reflector──▶pods ListWatch ├──apiserver──▶
|
||||||
|
// └────────▲─────────┘ │- Requeue(pod)/Dequeue(id)/Reoffer(pod)│ events │ │ │ │
|
||||||
|
// │ └───────────────────▲───────────────────┘ └───────────┬────────────┘ └───────────────┘
|
||||||
|
// │ │ │
|
||||||
|
// │ │ │
|
||||||
|
// └───────────────┐┌───────────────────▲────────────────────▲─────────────────────┐ └───────────────────────┐
|
||||||
|
// ││ │ │ ┌────────────────────┼─────────────────┐
|
||||||
|
// ┌───────────────────┼┼──────────────────────────────────────┐ │ ┌───────────────────┼────┼───────────┐ │ │
|
||||||
|
// ┌───────────▼──────────┐┌───────┴┴───────┐ ┌───────────────────┐ ┌──┴─┴─┴──────┐ ┌────────┴────┴───┐ ┌────▼────────▼─────────────┐ │
|
||||||
|
// │Binder (task launcher)││Deleter │ │PodReconciler │ │Controller │ │ ErrorHandler │ │SchedulerAlgorithm │ │
|
||||||
|
// │- Bind(binding) ││- DeleteOne(pod)│ │- Reconcile(pod) │ │- Run() │ │- Error(pod, err)│ │- Schedule(pod) -> NodeName│ │
|
||||||
|
// │ ││ │◀──│ │ │ │──▶│ │ │ │ │
|
||||||
|
// │ ┌─────┐││ ┌─────┐ │ │ ┌─────┐ │ │ ┌─────┐ │ │ ┌─────┐ │ │┌─────┐ │ │
|
||||||
|
// └───────────────┤sched├┘└────┤sched├─────┘ └──────┤sched├───▲──┘ └───┤sched├───┘ └────┤sched├──────┘ └┤sched├──────────────┬─────┘ │
|
||||||
|
// ├-│││-┴──────┴--││-┴────────────────┴--│--┴───┼──────────┴--│--┴────────────┴-│---┴──────────┴-│││-┤ ┌────────────▼─────────▼─────────┐
|
||||||
|
// │ │││ ││ │ │ │ │ │││ │ │ podScheduler │
|
||||||
|
// │ ││└───────────▼┼─────────────────────▼──────┼─────────────▼─────────────────▼────────────────┘││ │ │ (e.g. fcfsPodScheduler) │
|
||||||
|
// │ │└─────────────┼────────────────────────────┼─────────────┼──────────────────▼────────────────┘│ │ │ │
|
||||||
|
// │ │ │ │ │ │ │ │ │ scheduleOne(pod, offers ...) │
|
||||||
|
// │ │ │ │ │ │ │ │ │ ┌──────────────────────────┤
|
||||||
|
// │ │ │ ╲ │ │ │ ╱ │ │ │ ▼ │ │ │ allocationStrategy │
|
||||||
|
// │ │ │ ╲ └┐ │ ┌┘ ╱ │ │ │ │ │ │ - FitPredicate │
|
||||||
|
// │ │ │ ╲ │ │ │ ╱ │ │ │ │ │ │ - Procurement │
|
||||||
|
// │ │ │ ╲ └┐ │ ┌┘ ╱ │ │ │ │ └─────┴──────────────────────────┘
|
||||||
|
// │┌▼────────────┐┌▼──────────┐┌─▼─▼─▼─▼─▼─┐┌───┴────────┐┌───▼───┐ ┌────▼───┐ │
|
||||||
|
// ││LaunchTask(t)││KillTask(t)││sync.Mutex ││reconcile(t)││Tasks()│ │Offers()│ │
|
||||||
|
// │└──────┬──────┘└─────┬─────┘└───────────┘└────────▲───┘└───┬───┘ └────┬───┘ │
|
||||||
|
// │ │ │ │ │ │ │
|
||||||
|
// │ │ └──────────────────┐ │ ┌───▼────────────┐ │ │
|
||||||
|
// │ └──────────────────────────────┐ │ │ │podtask.Registry│ │ │
|
||||||
|
// │ │ │ │ └────────────────┘ │ │ ┌──────────────────────┐
|
||||||
|
// │ │ │ │ │ │ │ │
|
||||||
|
// │Scheduler │ └──────┐ │ │ │ │ A ──────────▶ B │
|
||||||
|
// └──────────────────────────────────────┼────────┼─┬│----┬──────────────────────┼───────────────────┘ │ │
|
||||||
|
// ┌──────────────────────────────────────┼────────┼─┤sched├──────────────────────┼─────────────────────────┐ │ A has a reference │
|
||||||
|
// │Framework │ │ └─────┘ ┌────▼───┐ │ │ on B and calls B │
|
||||||
|
// │ ┌──────▼──────┐┌▼──────────┐ │Offers()│ │ │ │
|
||||||
|
// │ │LaunchTask(t)││KillTask(t)│ └────┬───┘ │ └──────────────────────┘
|
||||||
|
// │ └─────────┬───┘└──────┬────┘ ┌────────▼───────┐ │
|
||||||
|
// │implements: mesos-go/scheduler.Scheduler └───────────▼ │offers.Registry │ │
|
||||||
|
// │ │ └────────────────┘ │
|
||||||
|
// │ ┌─────────────────┐ ┌──▼─────────────┐ │
|
||||||
|
// └────────────────────────┤ ├───────┤ Mesos ├────────────────────────────────────┘
|
||||||
|
// │ TasksReconciler │ │ Scheduler │
|
||||||
|
// │ ├───────▶ Driver │
|
||||||
|
// └─────────────────┘ └────────┬───────┘
|
||||||
|
// │
|
||||||
|
// │
|
||||||
|
// ▼
|
||||||
|
18
contrib/mesos/pkg/scheduler/errors/doc.go
Normal file
18
contrib/mesos/pkg/scheduler/errors/doc.go
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Package errors contains all scheduler-wide errors.
|
||||||
|
package errors
|
28
contrib/mesos/pkg/scheduler/errors/errors.go
Normal file
28
contrib/mesos/pkg/scheduler/errors/errors.go
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package errors
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
NoSuchPodErr = errors.New("No such pod exists")
|
||||||
|
NoSuchTaskErr = errors.New("No such task exists")
|
||||||
|
ReconciliationCancelledErr = errors.New("explicit task reconciliation cancelled")
|
||||||
|
NoSuitableOffersErr = errors.New("No suitable offers for pod/task")
|
||||||
|
)
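These sentinel errors are compared by identity, as the pod reconciler does with the deleter's result; below is a minimal standalone sketch of that idiom (deleteOne is a hypothetical stub, and NoSuchPodErr is redeclared only to keep the sketch self-contained):

package main

import (
	"errors"
	"fmt"
)

// mirrors scheduler/errors.NoSuchPodErr for the purpose of this sketch
var NoSuchPodErr = errors.New("No such pod exists")

// deleteOne is a hypothetical stub standing in for deleter.DeleteOne.
func deleteOne(name string) error { return NoSuchPodErr }

func main() {
	if err := deleteOne("web-1"); err != nil && err != NoSuchPodErr {
		fmt.Println("unexpected delete failure:", err)
	} else {
		fmt.Println("pod already gone; nothing to do")
	}
}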
|
@@ -112,10 +112,10 @@ type SchedulerProcess struct {
|
|||||||
fin chan struct{}
|
fin chan struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
func New(sched bindings.Scheduler) *SchedulerProcess {
|
func New(framework bindings.Scheduler) *SchedulerProcess {
|
||||||
p := &SchedulerProcess{
|
p := &SchedulerProcess{
|
||||||
Process: proc.New(),
|
Process: proc.New(),
|
||||||
Scheduler: sched,
|
Scheduler: framework,
|
||||||
stage: initStage,
|
stage: initStage,
|
||||||
elected: make(chan struct{}),
|
elected: make(chan struct{}),
|
||||||
failover: make(chan struct{}),
|
failover: make(chan struct{}),
|
||||||
|
18
contrib/mesos/pkg/scheduler/integration/doc.go
Normal file
18
contrib/mesos/pkg/scheduler/integration/doc.go
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Package integration implements integration tests.
|
||||||
|
package integration
|
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
|
|||||||
limitations under the License.
|
limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package scheduler
|
package integration
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
@@ -25,14 +25,6 @@ import (
|
|||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"k8s.io/kubernetes/pkg/api"
|
|
||||||
"k8s.io/kubernetes/pkg/api/testapi"
|
|
||||||
"k8s.io/kubernetes/pkg/api/unversioned"
|
|
||||||
"k8s.io/kubernetes/pkg/client/cache"
|
|
||||||
client "k8s.io/kubernetes/pkg/client/unversioned"
|
|
||||||
"k8s.io/kubernetes/pkg/runtime"
|
|
||||||
"k8s.io/kubernetes/pkg/watch"
|
|
||||||
|
|
||||||
log "github.com/golang/glog"
|
log "github.com/golang/glog"
|
||||||
mesos "github.com/mesos/mesos-go/mesosproto"
|
mesos "github.com/mesos/mesos-go/mesosproto"
|
||||||
"github.com/mesos/mesos-go/mesosutil"
|
"github.com/mesos/mesos-go/mesosutil"
|
||||||
@@ -41,13 +33,24 @@ import (
	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/mock"
	assertext "k8s.io/kubernetes/contrib/mesos/pkg/assert"
	assertext "k8s.io/kubernetes/contrib/mesos/pkg/assert"
	"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
	"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
	"k8s.io/kubernetes/contrib/mesos/pkg/queue"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/controller"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
	schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
	schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
	mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/testapi"
	"k8s.io/kubernetes/pkg/api/unversioned"
	"k8s.io/kubernetes/pkg/client/cache"
	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/runtime"
	"k8s.io/kubernetes/pkg/util"
	"k8s.io/kubernetes/pkg/util"
	"k8s.io/kubernetes/pkg/watch"
)
)

// A apiserver mock which partially mocks the pods API
// A apiserver mock which partially mocks the pods API
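The context line above introduces the test's apiserver mock. A self-contained sketch of what "partially mocking the pods API" looks like with net/http/httptest; the URL path and payload here are illustrative, not the exact routes the real TestServer registers:

package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
)

func main() {
	mux := http.NewServeMux()
	// Serve a canned, empty pod list for one namespace; every other route 404s,
	// which is what makes the mock "partial".
	mux.HandleFunc("/api/v1/namespaces/default/pods", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		fmt.Fprint(w, `{"kind":"PodList","items":[]}`)
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()

	resp, err := http.Get(srv.URL + "/api/v1/namespaces/default/pods")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status) // 200 OK for the mocked route, 404 for everything else
}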
@@ -399,19 +402,6 @@ func (a *EventAssertions) EventWithReason(observer *EventObserver, reason string
	}, msgAndArgs...)
	}, msgAndArgs...)
}
}

type joinableDriver struct {
	MockSchedulerDriver
	joinFunc func() (mesos.Status, error)
}

// Join invokes joinFunc if it has been set, otherwise blocks forever
func (m *joinableDriver) Join() (mesos.Status, error) {
	if m.joinFunc != nil {
		return m.joinFunc()
	}
	select {}
}

// Create mesos.TaskStatus for a given task
// Create mesos.TaskStatus for a given task
func newTaskStatusForTask(task *mesos.TaskInfo, state mesos.TaskState) *mesos.TaskStatus {
func newTaskStatusForTask(task *mesos.TaskInfo, state mesos.TaskState) *mesos.TaskStatus {
	healthy := state == mesos.TaskState_TASK_RUNNING
	healthy := state == mesos.TaskState_TASK_RUNNING
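The joinableDriver removed above (and its replacement, framework.JoinableDriver, used later in this diff) exists so tests can decide when the driver's Join call returns. A self-contained sketch of the same pattern with local stand-in types; mesos-go's real Status and driver interfaces are not used here:

package main

import "fmt"

// Status stands in for mesos.Status in this sketch.
type Status int

// JoinableDriver blocks in Join until a joinFunc is injected, mirroring the
// mock removed above: a production driver blocks until the framework stops,
// while tests supply joinFunc to make Join return on demand.
type JoinableDriver struct {
	joinFunc func() (Status, error)
}

func (d *JoinableDriver) Join() (Status, error) {
	if d.joinFunc != nil {
		return d.joinFunc()
	}
	select {} // block forever, like a driver that never terminates
}

func main() {
	d := &JoinableDriver{joinFunc: func() (Status, error) { return 1, nil }}
	s, err := d.Join()
	fmt.Println(s, err) // 1 <nil>
}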
@@ -436,12 +426,12 @@ type LaunchedTask struct {

type lifecycleTest struct {
type lifecycleTest struct {
	apiServer     *TestServer
	apiServer     *TestServer
	driver        *joinableDriver
	driver        *framework.JoinableDriver
	eventObs      *EventObserver
	eventObs      *EventObserver
	plugin        *schedulingPlugin
	podsListWatch *MockPodsListWatch
	podsListWatch *MockPodsListWatch
	scheduler     *KubernetesScheduler
	framework     framework.Framework
	schedulerProc *ha.SchedulerProcess
	schedulerProc *ha.SchedulerProcess
	sched         scheduler.Scheduler
	t             *testing.T
	t             *testing.T
}
}

@@ -454,15 +444,33 @@ func newLifecycleTest(t *testing.T) lifecycleTest {
	// create fake apiserver
	// create fake apiserver
	apiServer := NewTestServer(t, api.NamespaceDefault, podsListWatch)
	apiServer := NewTestServer(t, api.NamespaceDefault, podsListWatch)

	// create executor with some data for static pods if set
	// create ExecutorInfo with some data for static pods if set
	executor := mesosutil.NewExecutorInfo(
	ei := mesosutil.NewExecutorInfo(
		mesosutil.NewExecutorID("executor-id"),
		mesosutil.NewExecutorID("executor-id"),
		mesosutil.NewCommandInfo("executor-cmd"),
		mesosutil.NewCommandInfo("executor-cmd"),
	)
	)
	executor.Data = []byte{0, 1, 2}
	ei.Data = []byte{0, 1, 2}

	// create scheduler
	// create framework
	strategy := NewAllocationStrategy(
	client := client.NewOrDie(&client.Config{
		Host:    apiServer.server.URL,
		Version: testapi.Default.Version(),
	})
	c := *schedcfg.CreateDefaultConfig()
	fw := framework.New(framework.Config{
		Executor:        ei,
		Client:          client,
		SchedulerConfig: c,
		LookupNode:      apiServer.LookupNode,
	})

	// TODO(sttts): re-enable the following tests
	// assert.NotNil(framework.client, "client is nil")
	// assert.NotNil(framework.executor, "executor is nil")
	// assert.NotNil(framework.offers, "offer registry is nil")

	// create pod scheduler
	strategy := podschedulers.NewAllocationStrategy(
		podtask.NewDefaultPredicate(
		podtask.NewDefaultPredicate(
			mresource.DefaultDefaultContainerCPULimit,
			mresource.DefaultDefaultContainerCPULimit,
			mresource.DefaultDefaultContainerMemLimit,
			mresource.DefaultDefaultContainerMemLimit,
@@ -472,64 +480,39 @@ func newLifecycleTest(t *testing.T) lifecycleTest {
			mresource.DefaultDefaultContainerMemLimit,
			mresource.DefaultDefaultContainerMemLimit,
		),
		),
	)
	)
	fcfs := podschedulers.NewFCFSPodScheduler(strategy, apiServer.LookupNode)
	scheduler := New(Config{
		Executor: executor,
		Client: client.NewOrDie(&client.Config{
			Host:    apiServer.server.URL,
			Version: testapi.Default.Version(),
		}),
		Scheduler:  NewFCFSPodScheduler(strategy, apiServer.LookupNode),
		Schedcfg:   *schedcfg.CreateDefaultConfig(),
		LookupNode: apiServer.LookupNode,
	})

	assert.NotNil(scheduler.client, "client is nil")
	assert.NotNil(scheduler.executor, "executor is nil")
	assert.NotNil(scheduler.offers, "offer registry is nil")

	// create scheduler process
	// create scheduler process
	schedulerProc := ha.New(scheduler)
	schedulerProc := ha.New(fw)

	// get plugin config from it
	// create scheduler
	config := scheduler.NewPluginConfig(
		schedulerProc.Terminal(),
		http.DefaultServeMux,
		&podsListWatch.ListWatch,
	)
	assert.NotNil(config)

	// make events observable
	eventObs := NewEventObserver()
	eventObs := NewEventObserver()
	config.Recorder = eventObs
	scheduler := components.New(&c, fw, fcfs, client, eventObs, schedulerProc.Terminal(), http.DefaultServeMux, &podsListWatch.ListWatch)
	assert.NotNil(scheduler)
	// create plugin
	plugin := NewPlugin(config).(*schedulingPlugin)
	assert.NotNil(plugin)

	// create mock mesos scheduler driver
	// create mock mesos scheduler driver
	driver := &joinableDriver{}
	driver := &framework.JoinableDriver{}

	return lifecycleTest{
	return lifecycleTest{
		apiServer:     apiServer,
		apiServer:     apiServer,
		driver:        driver,
		driver:        driver,
		eventObs:      eventObs,
		eventObs:      eventObs,
		plugin:        plugin,
		podsListWatch: podsListWatch,
		podsListWatch: podsListWatch,
		scheduler:     scheduler,
		framework:     fw,
		schedulerProc: schedulerProc,
		schedulerProc: schedulerProc,
		sched:         scheduler,
		t:             t,
		t:             t,
	}
	}
}
}

func (lt lifecycleTest) Start() <-chan LaunchedTask {
func (lt lifecycleTest) Start() <-chan LaunchedTask {
	assert := &EventAssertions{*assert.New(lt.t)}
	assert := &EventAssertions{*assert.New(lt.t)}
	lt.plugin.Run(lt.schedulerProc.Terminal())
	lt.sched.Run(lt.schedulerProc.Terminal())

	// init scheduler
	// init framework
	err := lt.scheduler.Init(
	err := lt.framework.Init(
		lt.sched,
		lt.schedulerProc.Master(),
		lt.schedulerProc.Master(),
		lt.plugin,
		http.DefaultServeMux,
		http.DefaultServeMux,
	)
	)
	assert.NoError(err)
	assert.NoError(err)
@@ -582,7 +565,7 @@ func (lt lifecycleTest) Start() <-chan LaunchedTask {
	<-started
	<-started

	// tell scheduler to be registered
	// tell scheduler to be registered
	lt.scheduler.Registered(
	lt.framework.Registered(
		lt.driver,
		lt.driver,
		mesosutil.NewFrameworkID("kubernetes-id"),
		mesosutil.NewFrameworkID("kubernetes-id"),
		mesosutil.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050),
		mesosutil.NewMasterInfo("master-id", (192<<24)+(168<<16)+(0<<8)+1, 5050),
@@ -601,19 +584,10 @@ func (lt lifecycleTest) End() <-chan struct{} {
	return lt.schedulerProc.End()
	return lt.schedulerProc.End()
}
}

// Test to create the scheduler plugin with an empty plugin config
// TestScheduler_LifeCycle creates a scheduler plugin with the config returned by the scheduler,
func TestPlugin_New(t *testing.T) {
	assert := assert.New(t)

	c := PluginConfig{}
	p := NewPlugin(&c)
	assert.NotNil(p)
}

// TestPlugin_LifeCycle creates a scheduler plugin with the config returned by the scheduler,
// and plays through the whole life cycle of the plugin while creating pods, deleting
// and plays through the whole life cycle of the plugin while creating pods, deleting
// and failing them.
// and failing them.
func TestPlugin_LifeCycle(t *testing.T) {
func TestScheduler_LifeCycle(t *testing.T) {
	assert := &EventAssertions{*assert.New(t)}
	assert := &EventAssertions{*assert.New(t)}
	lt := newLifecycleTest(t)
	lt := newLifecycleTest(t)
	defer lt.Close()
	defer lt.Close()
@@ -627,29 +601,29 @@ func TestPlugin_LifeCycle(t *testing.T) {
		lt.podsListWatch.Add(pod, true) // notify watchers
		lt.podsListWatch.Add(pod, true) // notify watchers

		// wait for failedScheduling event because there is no offer
		// wait for failedScheduling event because there is no offer
		assert.EventWithReason(lt.eventObs, FailedScheduling, "failedScheduling event not received")
		assert.EventWithReason(lt.eventObs, controller.FailedScheduling, "failedScheduling event not received")

		// add some matching offer
		// add some matching offer
		offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
		offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
		lt.scheduler.ResourceOffers(nil, offers)
		lt.framework.ResourceOffers(nil, offers)

		// first offer is declined because node is not available yet
		// first offer is declined because node is not available yet
		lt.apiServer.WaitForNode("some_hostname")
		lt.apiServer.WaitForNode("some_hostname")

		// add one more offer
		// add one more offer
		lt.scheduler.ResourceOffers(nil, offers)
		lt.framework.ResourceOffers(nil, offers)

		// and wait for scheduled pod
		// and wait for scheduled pod
		assert.EventWithReason(lt.eventObs, Scheduled)
		assert.EventWithReason(lt.eventObs, controller.Scheduled)
		select {
		select {
		case launchedTask := <-launchedTasks:
		case launchedTask := <-launchedTasks:
			// report back that the task has been staged, and then started by mesos
			// report back that the task has been staged, and then started by mesos
			lt.scheduler.StatusUpdate(
			lt.framework.StatusUpdate(
				lt.driver,
				lt.driver,
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
			)
			)

			lt.scheduler.StatusUpdate(
			lt.framework.StatusUpdate(
				lt.driver,
				lt.driver,
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
			)
			)
@@ -660,7 +634,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
			// report back that the task has been lost
			// report back that the task has been lost
			lt.driver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)
			lt.driver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)

			lt.scheduler.StatusUpdate(
			lt.framework.StatusUpdate(
				lt.driver,
				lt.driver,
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_LOST),
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_LOST),
			)
			)
@@ -677,22 +651,22 @@ func TestPlugin_LifeCycle(t *testing.T) {
	// Launch a pod and wait until the scheduler driver is called
	// Launch a pod and wait until the scheduler driver is called
	schedulePodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
	schedulePodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
		// wait for failedScheduling event because there is no offer
		// wait for failedScheduling event because there is no offer
		assert.EventWithReason(lt.eventObs, FailedScheduling, "failedScheduling event not received")
		assert.EventWithReason(lt.eventObs, controller.FailedScheduling, "failedScheduling event not received")

		// supply a matching offer
		// supply a matching offer
		lt.scheduler.ResourceOffers(lt.driver, offers)
		lt.framework.ResourceOffers(lt.driver, offers)
		for _, offer := range offers {
		for _, offer := range offers {
			if _, ok := offeredNodes[offer.GetHostname()]; !ok {
			if _, ok := offeredNodes[offer.GetHostname()]; !ok {
				offeredNodes[offer.GetHostname()] = struct{}{}
				offeredNodes[offer.GetHostname()] = struct{}{}
				lt.apiServer.WaitForNode(offer.GetHostname())
				lt.apiServer.WaitForNode(offer.GetHostname())

				// reoffer since it must have been declined above
				// reoffer since it must have been declined above
				lt.scheduler.ResourceOffers(lt.driver, []*mesos.Offer{offer})
				lt.framework.ResourceOffers(lt.driver, []*mesos.Offer{offer})
			}
			}
		}
		}

		// and wait to get scheduled
		// and wait to get scheduled
		assert.EventWithReason(lt.eventObs, Scheduled)
		assert.EventWithReason(lt.eventObs, controller.Scheduled)

		// wait for driver.launchTasks call
		// wait for driver.launchTasks call
		select {
		select {
@@ -722,11 +696,11 @@ func TestPlugin_LifeCycle(t *testing.T) {
		pod, launchedTask, offer := launchPodWithOffers(pod, offers)
		pod, launchedTask, offer := launchPodWithOffers(pod, offers)
		if pod != nil {
		if pod != nil {
			// report back status
			// report back status
			lt.scheduler.StatusUpdate(
			lt.framework.StatusUpdate(
				lt.driver,
				lt.driver,
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING),
			)
			)
			lt.scheduler.StatusUpdate(
			lt.framework.StatusUpdate(
				lt.driver,
				lt.driver,
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING),
			)
			)
@@ -762,7 +736,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
		select {
		select {
		case <-killTaskCalled:
		case <-killTaskCalled:
			// report back that the task is finished
			// report back that the task is finished
			lt.scheduler.StatusUpdate(
			lt.framework.StatusUpdate(
				lt.driver,
				lt.driver,
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_FINISHED),
				newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_FINISHED),
			)
			)
@@ -787,8 +761,8 @@ func TestPlugin_LifeCycle(t *testing.T) {
	assert.Equal(offers[1].Id.GetValue(), usedOffer.Id.GetValue())
	assert.Equal(offers[1].Id.GetValue(), usedOffer.Id.GetValue())
	assert.Equal(pod.Spec.NodeName, *usedOffer.Hostname)
	assert.Equal(pod.Spec.NodeName, *usedOffer.Hostname)

	lt.scheduler.OfferRescinded(lt.driver, offers[0].Id)
	lt.framework.OfferRescinded(lt.driver, offers[0].Id)
	lt.scheduler.OfferRescinded(lt.driver, offers[2].Id)
	lt.framework.OfferRescinded(lt.driver, offers[2].Id)

	// start pods:
	// start pods:
	// - which are failing while binding,
	// - which are failing while binding,
@@ -800,7 +774,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
		status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
		status := newTaskStatusForTask(task, mesos.TaskState_TASK_FAILED)
		message := messages.CreateBindingFailure
		message := messages.CreateBindingFailure
		status.Message = &message
		status.Message = &message
		lt.scheduler.StatusUpdate(lt.driver, status)
		lt.framework.StatusUpdate(lt.driver, status)

		// wait until pod is looked up at the apiserver
		// wait until pod is looked up at the apiserver
		assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
		assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
@@ -822,7 +796,7 @@ func TestPlugin_LifeCycle(t *testing.T) {

	podKey, _ := podtask.MakePodKey(api.NewDefaultContext(), pod.Name)
	podKey, _ := podtask.MakePodKey(api.NewDefaultContext(), pod.Name)
	assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
	assertext.EventuallyTrue(t, util.ForeverTestTimeout, func() bool {
		t, _ := lt.plugin.api.tasks().ForPod(podKey)
		t, _ := lt.sched.Tasks().ForPod(podKey)
		return t == nil
		return t == nil
	})
	})

@@ -845,143 +819,3 @@ func TestPlugin_LifeCycle(t *testing.T) {
|
|||||||
time.Sleep(time.Second / 2)
|
time.Sleep(time.Second / 2)
|
||||||
failPodFromExecutor(launchedTask.taskInfo)
|
failPodFromExecutor(launchedTask.taskInfo)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestDeleteOne_NonexistentPod(t *testing.T) {
|
|
||||||
assert := assert.New(t)
|
|
||||||
obj := &MockScheduler{}
|
|
||||||
reg := podtask.NewInMemoryRegistry()
|
|
||||||
obj.On("tasks").Return(reg)
|
|
||||||
|
|
||||||
qr := newQueuer(nil)
|
|
||||||
assert.Equal(0, len(qr.podQueue.List()))
|
|
||||||
d := &deleter{
|
|
||||||
api: obj,
|
|
||||||
qr: qr,
|
|
||||||
}
|
|
||||||
pod := &Pod{Pod: &api.Pod{
|
|
||||||
ObjectMeta: api.ObjectMeta{
|
|
||||||
Name: "foo",
|
|
||||||
Namespace: api.NamespaceDefault,
|
|
||||||
}}}
|
|
||||||
err := d.deleteOne(pod)
|
|
||||||
assert.Equal(err, noSuchPodErr)
|
|
||||||
obj.AssertExpectations(t)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDeleteOne_PendingPod(t *testing.T) {
|
|
||||||
assert := assert.New(t)
|
|
||||||
obj := &MockScheduler{}
|
|
||||||
reg := podtask.NewInMemoryRegistry()
|
|
||||||
obj.On("tasks").Return(reg)
|
|
||||||
|
|
||||||
pod := &Pod{Pod: &api.Pod{
|
|
||||||
ObjectMeta: api.ObjectMeta{
|
|
||||||
Name: "foo",
|
|
||||||
UID: "foo0",
|
|
||||||
Namespace: api.NamespaceDefault,
|
|
||||||
}}}
|
|
||||||
|
|
||||||
task, err := podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("failed to create task: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
_, err = reg.Register(task)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("failed to register task: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// preconditions
|
|
||||||
qr := newQueuer(nil)
|
|
||||||
qr.podQueue.Add(pod, queue.ReplaceExisting)
|
|
||||||
assert.Equal(1, len(qr.podQueue.List()))
|
|
||||||
_, found := qr.podQueue.Get("default/foo")
|
|
||||||
assert.True(found)
|
|
||||||
|
|
||||||
// exec & post conditions
|
|
||||||
d := &deleter{
|
|
||||||
api: obj,
|
|
||||||
qr: qr,
|
|
||||||
}
|
|
||||||
err = d.deleteOne(pod)
|
|
||||||
assert.Nil(err)
|
|
||||||
_, found = qr.podQueue.Get("foo0")
|
|
||||||
assert.False(found)
|
|
||||||
assert.Equal(0, len(qr.podQueue.List()))
|
|
||||||
obj.AssertExpectations(t)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDeleteOne_Running(t *testing.T) {
|
|
||||||
assert := assert.New(t)
|
|
||||||
obj := &MockScheduler{}
|
|
||||||
reg := podtask.NewInMemoryRegistry()
|
|
||||||
obj.On("tasks").Return(reg)
|
|
||||||
|
|
||||||
pod := &Pod{Pod: &api.Pod{
|
|
||||||
ObjectMeta: api.ObjectMeta{
|
|
||||||
Name: "foo",
|
|
||||||
UID: "foo0",
|
|
||||||
Namespace: api.NamespaceDefault,
|
|
||||||
}}}
|
|
||||||
|
|
||||||
task, err := podtask.New(api.NewDefaultContext(), "bar", *pod.Pod, &mesos.ExecutorInfo{})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("unexpected error: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
task, err = reg.Register(task)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("unexpected error: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
task.Set(podtask.Launched)
|
|
||||||
err = reg.Update(task)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("unexpected error: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// preconditions
|
|
||||||
qr := newQueuer(nil)
|
|
||||||
qr.podQueue.Add(pod, queue.ReplaceExisting)
|
|
||||||
assert.Equal(1, len(qr.podQueue.List()))
|
|
||||||
_, found := qr.podQueue.Get("default/foo")
|
|
||||||
assert.True(found)
|
|
||||||
|
|
||||||
obj.On("killTask", task.ID).Return(nil)
|
|
||||||
|
|
||||||
// exec & post conditions
|
|
||||||
d := &deleter{
|
|
||||||
api: obj,
|
|
||||||
qr: qr,
|
|
||||||
}
|
|
||||||
err = d.deleteOne(pod)
|
|
||||||
assert.Nil(err)
|
|
||||||
_, found = qr.podQueue.Get("foo0")
|
|
||||||
assert.False(found)
|
|
||||||
assert.Equal(0, len(qr.podQueue.List()))
|
|
||||||
obj.AssertExpectations(t)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDeleteOne_badPodNaming(t *testing.T) {
|
|
||||||
assert := assert.New(t)
|
|
||||||
obj := &MockScheduler{}
|
|
||||||
pod := &Pod{Pod: &api.Pod{}}
|
|
||||||
d := &deleter{
|
|
||||||
api: obj,
|
|
||||||
qr: newQueuer(nil),
|
|
||||||
}
|
|
||||||
|
|
||||||
err := d.deleteOne(pod)
|
|
||||||
assert.NotNil(err)
|
|
||||||
|
|
||||||
pod.Pod.ObjectMeta.Name = "foo"
|
|
||||||
err = d.deleteOne(pod)
|
|
||||||
assert.NotNil(err)
|
|
||||||
|
|
||||||
pod.Pod.ObjectMeta.Name = ""
|
|
||||||
pod.Pod.ObjectMeta.Namespace = "bar"
|
|
||||||
err = d.deleteOne(pod)
|
|
||||||
assert.NotNil(err)
|
|
||||||
|
|
||||||
obj.AssertExpectations(t)
|
|
||||||
}
|
|
@@ -25,7 +25,6 @@ const (
	TaskIdKey                = "k8s.mesosphere.io/taskId"
	TaskIdKey                = "k8s.mesosphere.io/taskId"
	SlaveIdKey               = "k8s.mesosphere.io/slaveId"
	SlaveIdKey               = "k8s.mesosphere.io/slaveId"
	OfferIdKey               = "k8s.mesosphere.io/offerId"
	OfferIdKey               = "k8s.mesosphere.io/offerId"
	ExecutorIdKey            = "k8s.mesosphere.io/executorId"
	PortMappingKeyPrefix     = "k8s.mesosphere.io/port_"
	PortMappingKeyPrefix     = "k8s.mesosphere.io/port_"
	PortMappingKeyFormat     = PortMappingKeyPrefix + "%s_%d"
	PortMappingKeyFormat     = PortMappingKeyPrefix + "%s_%d"
	PortNameMappingKeyPrefix = "k8s.mesosphere.io/portName_"
	PortNameMappingKeyPrefix = "k8s.mesosphere.io/portName_"
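PortMappingKeyFormat above is an fmt format string; prepareTaskForLaunch in the plugin code removed below fills it with a container port's protocol and number to record the host port picked from a Mesos offer. A self-contained sketch of that annotation-building step (the helper and values here are illustrative, the constants are copied from the hunk above):

package main

import "fmt"

const (
	PortMappingKeyPrefix = "k8s.mesosphere.io/port_"
	PortMappingKeyFormat = PortMappingKeyPrefix + "%s_%d"
)

// portMappingAnnotation mirrors how prepareTaskForLaunch records the host port
// chosen from a Mesos offer for a given container port.
func portMappingAnnotation(protocol string, containerPort int, offerPort uint64) (key, value string) {
	key = fmt.Sprintf(PortMappingKeyFormat, protocol, containerPort)
	value = fmt.Sprintf("%d", offerPort)
	return key, value
}

func main() {
	k, v := portMappingAnnotation("TCP", 8080, 31000)
	fmt.Printf("%s = %s\n", k, v) // k8s.mesosphere.io/port_TCP_8080 = 31000
}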
@@ -1,930 +0,0 @@
|
|||||||
/*
|
|
||||||
Copyright 2015 The Kubernetes Authors All rights reserved.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package scheduler
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"net/http"
|
|
||||||
"strconv"
|
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
log "github.com/golang/glog"
|
|
||||||
mesos "github.com/mesos/mesos-go/mesosproto"
|
|
||||||
mutil "github.com/mesos/mesos-go/mesosutil"
|
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/backoff"
|
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
|
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/queue"
|
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
|
|
||||||
annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
|
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
|
|
||||||
"k8s.io/kubernetes/pkg/api"
|
|
||||||
"k8s.io/kubernetes/pkg/api/errors"
|
|
||||||
"k8s.io/kubernetes/pkg/client/cache"
|
|
||||||
"k8s.io/kubernetes/pkg/client/record"
|
|
||||||
client "k8s.io/kubernetes/pkg/client/unversioned"
|
|
||||||
"k8s.io/kubernetes/pkg/fields"
|
|
||||||
"k8s.io/kubernetes/pkg/util"
|
|
||||||
plugin "k8s.io/kubernetes/plugin/pkg/scheduler"
|
|
||||||
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
enqueuePopTimeout = 200 * time.Millisecond
|
|
||||||
enqueueWaitTimeout = 1 * time.Second
|
|
||||||
yieldPopTimeout = 200 * time.Millisecond
|
|
||||||
yieldWaitTimeout = 1 * time.Second
|
|
||||||
pluginRecoveryDelay = 100 * time.Millisecond // delay after scheduler plugin crashes, before we resume scheduling
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
FailedScheduling = "FailedScheduling"
|
|
||||||
Scheduled = "Scheduled"
|
|
||||||
)
|
|
||||||
|
|
||||||
// scheduler abstraction to allow for easier unit testing
|
|
||||||
type schedulerInterface interface {
|
|
||||||
sync.Locker // synchronize scheduler plugin operations
|
|
||||||
|
|
||||||
SlaveIndex
|
|
||||||
algorithm() PodScheduler
|
|
||||||
offers() offers.Registry
|
|
||||||
tasks() podtask.Registry
|
|
||||||
|
|
||||||
// driver calls
|
|
||||||
|
|
||||||
killTask(taskId string) error
|
|
||||||
launchTask(*podtask.T) error
|
|
||||||
|
|
||||||
// convenience
|
|
||||||
|
|
||||||
createPodTask(api.Context, *api.Pod) (*podtask.T, error)
|
|
||||||
}
|
|
||||||
|
|
||||||
type k8smScheduler struct {
|
|
||||||
sync.Mutex
|
|
||||||
internal *KubernetesScheduler
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *k8smScheduler) algorithm() PodScheduler {
|
|
||||||
return k.internal
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *k8smScheduler) offers() offers.Registry {
|
|
||||||
return k.internal.offers
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *k8smScheduler) tasks() podtask.Registry {
|
|
||||||
return k.internal.taskRegistry
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *k8smScheduler) createPodTask(ctx api.Context, pod *api.Pod) (*podtask.T, error) {
|
|
||||||
return podtask.New(ctx, "", *pod, k.internal.executor)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *k8smScheduler) slaveHostNameFor(id string) string {
|
|
||||||
return k.internal.slaveHostNames.HostName(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *k8smScheduler) killTask(taskId string) error {
|
|
||||||
killTaskId := mutil.NewTaskID(taskId)
|
|
||||||
_, err := k.internal.driver.KillTask(killTaskId)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *k8smScheduler) launchTask(task *podtask.T) error {
|
|
||||||
// assume caller is holding scheduler lock
|
|
||||||
taskList := []*mesos.TaskInfo{task.BuildTaskInfo()}
|
|
||||||
offerIds := []*mesos.OfferID{task.Offer.Details().Id}
|
|
||||||
filters := &mesos.Filters{}
|
|
||||||
_, err := k.internal.driver.LaunchTasks(offerIds, taskList, filters)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
type binder struct {
|
|
||||||
api schedulerInterface
|
|
||||||
}
|
|
||||||
|
|
||||||
// implements binding.Registry, launches the pod-associated-task in mesos
|
|
||||||
func (b *binder) Bind(binding *api.Binding) error {
|
|
||||||
|
|
||||||
ctx := api.WithNamespace(api.NewContext(), binding.Namespace)
|
|
||||||
|
|
||||||
// default upstream scheduler passes pod.Name as binding.Name
|
|
||||||
podKey, err := podtask.MakePodKey(ctx, binding.Name)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
b.api.Lock()
|
|
||||||
defer b.api.Unlock()
|
|
||||||
|
|
||||||
switch task, state := b.api.tasks().ForPod(podKey); state {
|
|
||||||
case podtask.StatePending:
|
|
||||||
return b.bind(ctx, binding, task)
|
|
||||||
default:
|
|
||||||
// in this case it's likely that the pod has been deleted between Schedule
|
|
||||||
// and Bind calls
|
|
||||||
log.Infof("No pending task for pod %s", podKey)
|
|
||||||
return noSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *binder) rollback(task *podtask.T, err error) error {
|
|
||||||
task.Offer.Release()
|
|
||||||
task.Reset()
|
|
||||||
if err2 := b.api.tasks().Update(task); err2 != nil {
|
|
||||||
log.Errorf("failed to update pod task: %v", err2)
|
|
||||||
}
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// assumes that: caller has acquired scheduler lock and that the task is still pending
|
|
||||||
//
|
|
||||||
// bind does not actually do the binding itself, but launches the pod as a Mesos task. The
|
|
||||||
// kubernetes executor on the slave will finally do the binding. This is different from the
|
|
||||||
// upstream scheduler in the sense that the upstream scheduler does the binding and the
|
|
||||||
// kubelet will notice that and launches the pod.
|
|
||||||
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
|
|
||||||
// sanity check: ensure that the task hasAcceptedOffer(), it's possible that between
|
|
||||||
// Schedule() and now that the offer for this task was rescinded or invalidated.
|
|
||||||
// ((we should never see this here))
|
|
||||||
if !task.HasAcceptedOffer() {
|
|
||||||
return fmt.Errorf("task has not accepted a valid offer %v", task.ID)
|
|
||||||
}
|
|
||||||
|
|
||||||
// By this time, there is a chance that the slave is disconnected.
|
|
||||||
offerId := task.GetOfferId()
|
|
||||||
if offer, ok := b.api.offers().Get(offerId); !ok || offer.HasExpired() {
|
|
||||||
// already rescinded or timed out or otherwise invalidated
|
|
||||||
return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID))
|
|
||||||
}
|
|
||||||
|
|
||||||
if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
|
|
||||||
log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\", cpu %.2f, mem %.2f MB",
|
|
||||||
task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name, task.Spec.CPU, task.Spec.Memory)
|
|
||||||
if err = b.api.launchTask(task); err == nil {
|
|
||||||
b.api.offers().Invalidate(offerId)
|
|
||||||
task.Set(podtask.Launched)
|
|
||||||
if err = b.api.tasks().Update(task); err != nil {
|
|
||||||
// this should only happen if the task has been removed or has changed status,
|
|
||||||
// which SHOULD NOT HAPPEN as long as we're synchronizing correctly
|
|
||||||
log.Errorf("failed to update task w/ Launched status: %v", err)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err))
|
|
||||||
}
|
|
||||||
|
|
||||||
//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified
|
|
||||||
func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error {
|
|
||||||
pod := task.Pod
|
|
||||||
|
|
||||||
// we make an effort here to avoid making changes to the task's copy of the pod, since
|
|
||||||
// we want that to reflect the initial user spec, and not the modified spec that we
|
|
||||||
// build for the executor to consume.
|
|
||||||
oemCt := pod.Spec.Containers
|
|
||||||
pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
|
|
||||||
|
|
||||||
if pod.Annotations == nil {
|
|
||||||
pod.Annotations = make(map[string]string)
|
|
||||||
}
|
|
||||||
|
|
||||||
task.SaveRecoveryInfo(pod.Annotations)
|
|
||||||
pod.Annotations[annotation.BindingHostKey] = task.Spec.AssignedSlave
|
|
||||||
|
|
||||||
for _, entry := range task.Spec.PortMap {
|
|
||||||
oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
|
|
||||||
ports := append([]api.ContainerPort{}, oemPorts...)
|
|
||||||
p := &ports[entry.PortIdx]
|
|
||||||
p.HostPort = int(entry.OfferPort)
|
|
||||||
op := strconv.FormatUint(entry.OfferPort, 10)
|
|
||||||
pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op
|
|
||||||
if p.Name != "" {
|
|
||||||
pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op
|
|
||||||
}
|
|
||||||
pod.Spec.Containers[entry.ContainerIdx].Ports = ports
|
|
||||||
}
|
|
||||||
|
|
||||||
// the kubelet-executor uses this to instantiate the pod
|
|
||||||
log.V(3).Infof("prepared pod spec: %+v", pod)
|
|
||||||
|
|
||||||
data, err := api.Codec.Encode(&pod)
|
|
||||||
if err != nil {
|
|
||||||
log.V(2).Infof("Failed to marshal the pod spec: %v", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
task.Spec.Data = data
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type kubeScheduler struct {
|
|
||||||
api schedulerInterface
|
|
||||||
podUpdates queue.FIFO
|
|
||||||
}
|
|
||||||
|
|
||||||
// recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
|
|
||||||
// the BindingHostKey. For tasks in the registry of the scheduler, the same
|
|
||||||
// value is stored in T.Spec.AssignedSlave. Before launching, the BindingHostKey
|
|
||||||
// annotation is added and the executor will eventually persist that to the
|
|
||||||
// apiserver on binding.
|
|
||||||
func recoverAssignedSlave(pod *api.Pod) string {
|
|
||||||
return pod.Annotations[annotation.BindingHostKey]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Schedule implements the Scheduler interface of Kubernetes.
|
|
||||||
// It returns the selectedMachine's name and error (if there's any).
|
|
||||||
func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.NodeLister) (string, error) {
|
|
||||||
log.Infof("Try to schedule pod %v\n", pod.Name)
|
|
||||||
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
|
|
||||||
|
|
||||||
// default upstream scheduler passes pod.Name as binding.PodID
|
|
||||||
podKey, err := podtask.MakePodKey(ctx, pod.Name)
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
k.api.Lock()
|
|
||||||
defer k.api.Unlock()
|
|
||||||
|
|
||||||
switch task, state := k.api.tasks().ForPod(podKey); state {
|
|
||||||
case podtask.StateUnknown:
|
|
||||||
// There's a bit of a potential race here, a pod could have been yielded() and
|
|
||||||
// then before we get *here* it could be deleted.
|
|
||||||
// We use meta to index the pod in the store since that's what k8s reflector does.
|
|
||||||
podName, err := cache.MetaNamespaceKeyFunc(pod)
|
|
||||||
if err != nil {
|
|
||||||
log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
|
|
||||||
return "", noSuchPodErr
|
|
||||||
}
|
|
||||||
if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
|
|
||||||
// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
|
|
||||||
log.Infof("aborting Schedule, pod has been deleted %+v", pod)
|
|
||||||
return "", noSuchPodErr
|
|
||||||
}
|
|
||||||
|
|
||||||
task, err := k.api.createPodTask(ctx, pod)
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
task, err = k.api.tasks().Register(task)
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
return k.doSchedule(task)
|
|
||||||
|
|
||||||
//TODO(jdef) it's possible that the pod state has diverged from what
|
|
||||||
//we knew previously, we should probably update the task.Pod state here
|
|
||||||
//before proceeding with scheduling
|
|
||||||
case podtask.StatePending:
|
|
||||||
if pod.UID != task.Pod.UID {
|
|
||||||
// we're dealing with a brand new pod spec here, so the old one must have been
|
|
||||||
// deleted -- and so our task store is out of sync w/ respect to reality
|
|
||||||
//TODO(jdef) reconcile task
|
|
||||||
return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
|
|
||||||
} else if task.Has(podtask.Launched) {
|
|
||||||
// task has been marked as "launched" but the pod binding creation may have failed in k8s,
|
|
||||||
// but we're going to let someone else handle it, probably the mesos task error handler
|
|
||||||
return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
|
|
||||||
} else {
|
|
||||||
return k.doSchedule(task)
|
|
||||||
}
|
|
||||||
|
|
||||||
default:
|
|
||||||
return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// doSchedule schedules the given task and returns the machine the task is scheduled on
|
|
||||||
// or an error if the scheduling failed.
|
|
||||||
func (k *kubeScheduler) doSchedule(task *podtask.T) (string, error) {
|
|
||||||
var offer offers.Perishable
|
|
||||||
var err error
|
|
||||||
|
|
||||||
if task.HasAcceptedOffer() {
|
|
||||||
// verify that the offer is still on the table
|
|
||||||
var ok bool
|
|
||||||
offer, ok = k.api.offers().Get(task.GetOfferId())
|
|
||||||
|
|
||||||
if !ok || offer.HasExpired() {
|
|
||||||
task.Offer.Release()
|
|
||||||
task.Reset()
|
|
||||||
if err = k.api.tasks().Update(task); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if offer == nil {
|
|
||||||
offer, err = k.api.algorithm().SchedulePod(k.api.offers(), k.api, task)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
details := offer.Details()
|
|
||||||
if details == nil {
|
|
||||||
return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
|
|
||||||
}
|
|
||||||
|
|
||||||
slaveId := details.GetSlaveId().GetValue()
|
|
||||||
slaveHostName := k.api.slaveHostNameFor(slaveId)
|
|
||||||
if slaveHostName == "" {
|
|
||||||
// not much sense in Release()ing the offer here since its owner died
|
|
||||||
offer.Release()
|
|
||||||
k.api.offers().Invalidate(details.Id.GetValue())
|
|
||||||
return "", fmt.Errorf("Slave disappeared (%v) while scheduling task %v", slaveId, task.ID)
|
|
||||||
}
|
|
||||||
|
|
||||||
if task.Offer != nil && task.Offer != offer {
|
|
||||||
return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
|
|
||||||
}
|
|
||||||
|
|
||||||
task.Offer = offer
|
|
||||||
if err := k.api.algorithm().Procurement()(task, details); err != nil {
|
|
||||||
offer.Release()
|
|
||||||
task.Reset()
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := k.api.tasks().Update(task); err != nil {
|
|
||||||
offer.Release()
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
return slaveHostName, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type queuer struct {
|
|
||||||
lock sync.Mutex // shared by condition variables of this struct
|
|
||||||
podUpdates queue.FIFO // queue of pod updates to be processed
|
|
||||||
podQueue *queue.DelayFIFO // queue of pods to be scheduled
|
|
||||||
deltaCond sync.Cond // pod changes are available for processing
|
|
||||||
unscheduledCond sync.Cond // there are unscheduled pods for processing
|
|
||||||
}
|
|
||||||
|
|
||||||
func newQueuer(store queue.FIFO) *queuer {
|
|
||||||
q := &queuer{
|
|
||||||
podQueue: queue.NewDelayFIFO(),
|
|
||||||
podUpdates: store,
|
|
||||||
}
|
|
||||||
q.deltaCond.L = &q.lock
|
|
||||||
q.unscheduledCond.L = &q.lock
|
|
||||||
return q
|
|
||||||
}
|
|
||||||
|
|
||||||
func (q *queuer) installDebugHandlers(mux *http.ServeMux) {
|
|
||||||
mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
|
|
||||||
for _, x := range q.podQueue.List() {
|
|
||||||
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
|
|
||||||
for _, x := range q.podUpdates.List() {
|
|
||||||
if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// signal that there are probably pod updates waiting to be processed
|
|
||||||
func (q *queuer) updatesAvailable() {
|
|
||||||
q.deltaCond.Broadcast()
|
|
||||||
}
|
|
||||||
|
|
||||||
// delete a pod from the to-be-scheduled queue
|
|
||||||
func (q *queuer) dequeue(id string) {
|
|
||||||
q.podQueue.Delete(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
// re-add a pod to the to-be-scheduled queue, will not overwrite existing pod data (that
|
|
||||||
// may have already changed).
|
|
||||||
func (q *queuer) requeue(pod *Pod) {
|
|
||||||
// use KeepExisting in case the pod has already been updated (can happen if binding fails
|
|
||||||
// due to constraint voilations); we don't want to overwrite a newer entry with stale data.
|
|
||||||
q.podQueue.Add(pod, queue.KeepExisting)
|
|
||||||
q.unscheduledCond.Broadcast()
|
|
||||||
}
|
|
||||||
|
|
||||||
// same as requeue but calls podQueue.Offer instead of podQueue.Add
|
|
||||||
func (q *queuer) reoffer(pod *Pod) {
|
|
||||||
// use KeepExisting in case the pod has already been updated (can happen if binding fails
|
|
||||||
// due to constraint voilations); we don't want to overwrite a newer entry with stale data.
|
|
||||||
if q.podQueue.Offer(pod, queue.KeepExisting) {
|
|
||||||
q.unscheduledCond.Broadcast()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// spawns a go-routine to watch for unscheduled pods and queue them up
|
|
||||||
// for scheduling. returns immediately.
|
|
||||||
func (q *queuer) Run(done <-chan struct{}) {
|
|
||||||
go runtime.Until(func() {
|
|
||||||
log.Info("Watching for newly created pods")
|
|
||||||
q.lock.Lock()
|
|
||||||
defer q.lock.Unlock()
|
|
||||||
|
|
||||||
for {
|
|
||||||
// limit blocking here for short intervals so that scheduling
|
|
||||||
// may proceed even if there have been no recent pod changes
|
|
||||||
p := q.podUpdates.Await(enqueuePopTimeout)
|
|
||||||
if p == nil {
|
|
||||||
signalled := runtime.After(q.deltaCond.Wait)
|
|
||||||
// we've yielded the lock
|
|
||||||
select {
|
|
||||||
case <-time.After(enqueueWaitTimeout):
|
|
||||||
q.deltaCond.Broadcast() // abort Wait()
|
|
||||||
<-signalled // wait for lock re-acquisition
|
|
||||||
log.V(4).Infoln("timed out waiting for a pod update")
|
|
||||||
case <-signalled:
|
|
||||||
// we've acquired the lock and there may be
|
|
||||||
// changes for us to process now
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
pod := p.(*Pod)
|
|
||||||
if recoverAssignedSlave(pod.Pod) != "" {
|
|
||||||
log.V(3).Infof("dequeuing assigned pod for scheduling: %v", pod.Pod.Name)
|
|
||||||
q.dequeue(pod.GetUID())
|
|
||||||
} else {
|
|
||||||
// use ReplaceExisting because we are always pushing the latest state
|
|
||||||
now := time.Now()
|
|
||||||
pod.deadline = &now
|
|
||||||
if q.podQueue.Offer(pod, queue.ReplaceExisting) {
|
|
||||||
q.unscheduledCond.Broadcast()
|
|
||||||
log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
|
|
||||||
} else {
|
|
||||||
log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}, 1*time.Second, done)
|
|
||||||
}
|
|
||||||
|
|
||||||
// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
|
|
||||||
func (q *queuer) yield() *api.Pod {
|
|
||||||
log.V(2).Info("attempting to yield a pod")
|
|
||||||
q.lock.Lock()
|
|
||||||
defer q.lock.Unlock()
|
|
||||||
|
|
||||||
for {
|
|
||||||
// limit blocking here to short intervals so that we don't block the
|
|
||||||
// enqueuer Run() routine for very long
|
|
||||||
kpod := q.podQueue.Await(yieldPopTimeout)
|
|
||||||
if kpod == nil {
|
|
||||||
signalled := runtime.After(q.unscheduledCond.Wait)
|
|
||||||
// lock is yielded at this point and we're going to wait for either
|
|
||||||
// a timeout, or a signal that there's data
|
|
||||||
select {
|
|
||||||
case <-time.After(yieldWaitTimeout):
|
|
||||||
q.unscheduledCond.Broadcast() // abort Wait()
|
|
||||||
<-signalled // wait for the go-routine, and the lock
|
|
||||||
log.V(4).Infoln("timed out waiting for a pod to yield")
|
|
||||||
case <-signalled:
|
|
||||||
// we have acquired the lock, and there
|
|
||||||
// may be a pod for us to pop now
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
pod := kpod.(*Pod).Pod
|
|
||||||
if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
|
|
||||||
log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
|
|
||||||
} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
|
|
||||||
log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
|
|
||||||
} else if recoverAssignedSlave(pod) != "" {
|
|
||||||
// should never happen if enqueuePods is filtering properly
|
|
||||||
log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
|
|
||||||
} else {
|
|
||||||
return pod
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
type errorHandler struct {
|
|
||||||
api schedulerInterface
|
|
||||||
backoff *backoff.Backoff
|
|
||||||
qr *queuer
|
|
||||||
}
|
|
||||||
|
|
||||||
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
|
|
||||||
func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {
|
|
||||||
|
|
||||||
if schedulingErr == noSuchPodErr {
|
|
||||||
log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
|
|
||||||
defer util.HandleCrash()
|
|
||||||
|
|
||||||
// default upstream scheduler passes pod.Name as binding.PodID
|
|
||||||
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
|
|
||||||
podKey, err := podtask.MakePodKey(ctx, pod.Name)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
k.backoff.GC()
|
|
||||||
k.api.Lock()
|
|
||||||
defer k.api.Unlock()
|
|
||||||
|
|
||||||
switch task, state := k.api.tasks().ForPod(podKey); state {
|
|
||||||
case podtask.StateUnknown:
|
|
||||||
// if we don't have a mapping here any more then someone deleted the pod
|
|
||||||
log.V(2).Infof("Could not resolve pod to task, aborting pod reschdule: %s", podKey)
|
|
||||||
return
|
|
||||||
|
|
||||||
case podtask.StatePending:
|
|
||||||
if task.Has(podtask.Launched) {
|
|
||||||
log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
breakoutEarly := queue.BreakChan(nil)
|
|
||||||
if schedulingErr == noSuitableOffersErr {
|
|
||||||
log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
|
|
||||||
breakoutEarly = queue.BreakChan(k.api.offers().Listen(podKey, func(offer *mesos.Offer) bool {
|
|
||||||
k.api.Lock()
|
|
||||||
defer k.api.Unlock()
|
|
||||||
switch task, state := k.api.tasks().Get(task.ID); state {
|
|
||||||
case podtask.StatePending:
|
|
||||||
// Assess fitness of pod with the current offer. The scheduler normally
|
|
||||||
// "backs off" when it can't find an offer that matches up with a pod.
|
|
||||||
// The backoff period for a pod can terminate sooner if an offer becomes
|
|
||||||
// available that matches up.
|
|
||||||
return !task.Has(podtask.Launched) && k.api.algorithm().FitPredicate()(task, offer, nil)
|
|
||||||
default:
|
|
||||||
// no point in continuing to check for matching offers
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
delay := k.backoff.Get(podKey)
|
|
||||||
log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
|
|
||||||
k.qr.requeue(&Pod{Pod: pod, delay: &delay, notify: breakoutEarly})
|
|
||||||
|
|
||||||
default:
|
|
||||||
log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
type deleter struct {
|
|
||||||
api schedulerInterface
|
|
||||||
qr *queuer
|
|
||||||
}
|
|
||||||
|
|
||||||
// currently monitors for "pod deleted" events, upon which handle()
|
|
||||||
// is invoked.
|
|
||||||
func (k *deleter) Run(updates <-chan queue.Entry, done <-chan struct{}) {
|
|
||||||
go runtime.Until(func() {
|
|
||||||
for {
|
|
||||||
entry := <-updates
|
|
||||||
pod := entry.Value().(*Pod)
|
|
||||||
if entry.Is(queue.DELETE_EVENT) {
|
|
||||||
if err := k.deleteOne(pod); err != nil {
|
|
||||||
log.Error(err)
|
|
||||||
}
|
|
||||||
} else if !entry.Is(queue.POP_EVENT) {
|
|
||||||
k.qr.updatesAvailable()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}, 1*time.Second, done)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *deleter) deleteOne(pod *Pod) error {
|
|
||||||
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
|
|
||||||
podKey, err := podtask.MakePodKey(ctx, pod.Name)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
log.V(2).Infof("pod deleted: %v", podKey)
|
|
||||||
|
|
||||||
// order is important here: we want to make sure we have the lock before
|
|
||||||
// removing the pod from the scheduling queue. this makes the concurrent
|
|
||||||
// execution of scheduler-error-handling and delete-handling easier to
|
|
||||||
// reason about.
|
|
||||||
k.api.Lock()
|
|
||||||
defer k.api.Unlock()
|
|
||||||
|
|
||||||
// prevent the scheduler from attempting to pop this; it's also possible that
|
|
||||||
// it's concurrently being scheduled (somewhere between pod scheduling and
|
|
||||||
// binding) - if so, then we'll end up removing it from taskRegistry which
|
|
||||||
// will abort Bind()ing
|
|
||||||
k.qr.dequeue(pod.GetUID())
|
|
||||||
|
|
||||||
switch task, state := k.api.tasks().ForPod(podKey); state {
|
|
||||||
case podtask.StateUnknown:
|
|
||||||
log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
|
|
||||||
return noSuchPodErr
|
|
||||||
|
|
||||||
// determine if the task has already been launched to mesos, if not then
|
|
||||||
// cleanup is easier (unregister) since there's no state to sync
|
|
||||||
case podtask.StatePending:
|
|
||||||
if !task.Has(podtask.Launched) {
|
|
||||||
// we've been invoked in between Schedule() and Bind()
|
|
||||||
if task.HasAcceptedOffer() {
|
|
||||||
task.Offer.Release()
|
|
||||||
task.Reset()
|
|
||||||
task.Set(podtask.Deleted)
|
|
||||||
//TODO(jdef) probably want better handling here
|
|
||||||
if err := k.api.tasks().Update(task); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
k.api.tasks().Unregister(task)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
fallthrough
|
|
||||||
|
|
||||||
case podtask.StateRunning:
|
|
||||||
// signal to watchers that the related pod is going down
|
|
||||||
task.Set(podtask.Deleted)
|
|
||||||
if err := k.api.tasks().Update(task); err != nil {
|
|
||||||
log.Errorf("failed to update task w/ Deleted status: %v", err)
|
|
||||||
}
|
|
||||||
return k.api.killTask(task.ID)
|
|
||||||
|
|
||||||
default:
|
|
||||||
log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
|
|
||||||
return noSuchTaskErr
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create creates a scheduler plugin and all supporting background functions.
|
|
||||||
func (k *KubernetesScheduler) NewDefaultPluginConfig(terminate <-chan struct{}, mux *http.ServeMux) *PluginConfig {
|
|
||||||
// use ListWatch watching pods using the client by default
|
|
||||||
return k.NewPluginConfig(terminate, mux, createAllPodsLW(k.client))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (k *KubernetesScheduler) NewPluginConfig(terminate <-chan struct{}, mux *http.ServeMux,
|
|
||||||
podsWatcher *cache.ListWatch) *PluginConfig {
|
|
||||||
|
|
||||||
// Watch and queue pods that need scheduling.
|
|
||||||
updates := make(chan queue.Entry, k.schedcfg.UpdatesBacklog)
|
|
||||||
podUpdates := &podStoreAdapter{queue.NewHistorical(updates)}
|
|
||||||
-reflector := cache.NewReflector(podsWatcher, &api.Pod{}, podUpdates, 0)
-
-// lock that guards critial sections that involve transferring pods from
-// the store (cache) to the scheduling queue; its purpose is to maintain
-// an ordering (vs interleaving) of operations that's easier to reason about.
-kapi := &k8smScheduler{internal: k}
-q := newQueuer(podUpdates)
-podDeleter := &deleter{
-api: kapi,
-qr: q,
-}
-eh := &errorHandler{
-api: kapi,
-backoff: backoff.New(k.schedcfg.InitialPodBackoff.Duration, k.schedcfg.MaxPodBackoff.Duration),
-qr: q,
-}
-startLatch := make(chan struct{})
-eventBroadcaster := record.NewBroadcaster()
-runtime.On(startLatch, func() {
-eventBroadcaster.StartRecordingToSink(k.client.Events(""))
-reflector.Run() // TODO(jdef) should listen for termination
-podDeleter.Run(updates, terminate)
-q.Run(terminate)
-
-q.installDebugHandlers(mux)
-podtask.InstallDebugHandlers(k.taskRegistry, mux)
-})
-return &PluginConfig{
-Config: &plugin.Config{
-NodeLister: nil,
-Algorithm: &kubeScheduler{
-api: kapi,
-podUpdates: podUpdates,
-},
-Binder: &binder{api: kapi},
-NextPod: q.yield,
-Error: eh.handleSchedulingError,
-Recorder: eventBroadcaster.NewRecorder(api.EventSource{Component: "scheduler"}),
-},
-api: kapi,
-client: k.client,
-qr: q,
-deleter: podDeleter,
-starting: startLatch,
-}
-}
-
-type PluginConfig struct {
-*plugin.Config
-api schedulerInterface
-client *client.Client
-qr *queuer
-deleter *deleter
-starting chan struct{} // startup latch
-}
-
-func NewPlugin(c *PluginConfig) PluginInterface {
-return &schedulingPlugin{
-config: c.Config,
-api: c.api,
-client: c.client,
-qr: c.qr,
-deleter: c.deleter,
-starting: c.starting,
-}
-}
-
-type schedulingPlugin struct {
-config *plugin.Config
-api schedulerInterface
-client *client.Client
-qr *queuer
-deleter *deleter
-starting chan struct{}
-}
-
-func (s *schedulingPlugin) Run(done <-chan struct{}) {
-defer close(s.starting)
-go runtime.Until(s.scheduleOne, pluginRecoveryDelay, done)
-}
-
-// hacked from GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/scheduler.go,
-// with the Modeler stuff removed since we don't use it because we have mesos.
-func (s *schedulingPlugin) scheduleOne() {
-pod := s.config.NextPod()
-
-// pods which are pre-scheduled (i.e. NodeName is set) are deleted by the kubelet
-// in upstream. Not so in Mesos because the kubelet hasn't see that pod yet. Hence,
-// the scheduler has to take care of this:
-if pod.Spec.NodeName != "" && pod.DeletionTimestamp != nil {
-log.V(3).Infof("deleting pre-scheduled, not yet running pod: %s/%s", pod.Namespace, pod.Name)
-s.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0))
-return
-}
-
-log.V(3).Infof("Attempting to schedule: %+v", pod)
-dest, err := s.config.Algorithm.Schedule(pod, s.config.NodeLister) // call kubeScheduler.Schedule
-if err != nil {
-log.V(1).Infof("Failed to schedule: %+v", pod)
-s.config.Recorder.Eventf(pod, FailedScheduling, "Error scheduling: %v", err)
-s.config.Error(pod, err)
-return
-}
-b := &api.Binding{
-ObjectMeta: api.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
-Target: api.ObjectReference{
-Kind: "Node",
-Name: dest,
-},
-}
-if err := s.config.Binder.Bind(b); err != nil {
-log.V(1).Infof("Failed to bind pod: %+v", err)
-s.config.Recorder.Eventf(pod, FailedScheduling, "Binding rejected: %v", err)
-s.config.Error(pod, err)
-return
-}
-s.config.Recorder.Eventf(pod, Scheduled, "Successfully assigned %v to %v", pod.Name, dest)
-}
-
-// this pod may be out of sync with respect to the API server registry:
-//      this pod   |  apiserver registry
-//    -------------|----------------------
-//      host=.*    |  404          ; pod was deleted
-//      host=.*    |  5xx          ; failed to sync, try again later?
-//      host=""    |  host=""      ; perhaps no updates to process?
-//      host=""    |  host="..."   ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
-//      host="..." |  host=""      ; pod is no longer scheduled, does it need to be re-queued?
-//      host="..." |  host="..."   ; perhaps no updates to process?
-//
-// TODO(jdef) this needs an integration test
-func (s *schedulingPlugin) reconcileTask(t *podtask.T) {
-log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
-ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
-pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
-if err != nil {
-if errors.IsNotFound(err) {
-// attempt to delete
-if err = s.deleter.deleteOne(&Pod{Pod: &t.Pod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
-log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
-}
-} else {
-//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
-//For now, drop the pod on the floor
-log.Warning("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
-}
-return
-}
-
-log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
-if t.Spec.AssignedSlave != pod.Spec.NodeName {
-if pod.Spec.NodeName == "" {
-// pod is unscheduled.
-// it's possible that we dropped the pod in the scheduler error handler
-// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
-
-podKey, err := podtask.MakePodKey(ctx, pod.Name)
-if err != nil {
-log.Error(err)
-return
-}
-
-s.api.Lock()
-defer s.api.Unlock()
-
-if _, state := s.api.tasks().ForPod(podKey); state != podtask.StateUnknown {
-//TODO(jdef) reconcile the task
-log.Errorf("task already registered for pod %v", pod.Name)
-return
-}
-
-now := time.Now()
-log.V(3).Infof("reoffering pod %v", podKey)
-s.qr.reoffer(&Pod{
-Pod: pod,
-deadline: &now,
-})
-} else {
-// pod is scheduled.
-// not sure how this happened behind our backs. attempt to reconstruct
-// at least a partial podtask.T record.
-//TODO(jdef) reconcile the task
-log.Errorf("pod already scheduled: %v", pod.Name)
-}
-} else {
-//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
-//and assume that our knowledge of the pod aligns with that of the apiserver
-log.Error("pod reconciliation does not support updates; not yet implemented")
-}
-}
-
-func parseSelectorOrDie(s string) fields.Selector {
-selector, err := fields.ParseSelector(s)
-if err != nil {
-panic(err)
-}
-return selector
-}
-
-// createAllPodsLW returns a listWatch that finds all pods
-func createAllPodsLW(cl *client.Client) *cache.ListWatch {
-return cache.NewListWatchFromClient(cl, "pods", api.NamespaceAll, parseSelectorOrDie(""))
-}
-
-// Consumes *api.Pod, produces *Pod; the k8s reflector wants to push *api.Pod
-// objects at us, but we want to store more flexible (Pod) type defined in
-// this package. The adapter implementation facilitates this. It's a little
-// hackish since the object type going in is different than the object type
-// coming out -- you've been warned.
-type podStoreAdapter struct {
-queue.FIFO
-}
-
-func (psa *podStoreAdapter) Add(obj interface{}) error {
-pod := obj.(*api.Pod)
-return psa.FIFO.Add(&Pod{Pod: pod})
-}
-
-func (psa *podStoreAdapter) Update(obj interface{}) error {
-pod := obj.(*api.Pod)
-return psa.FIFO.Update(&Pod{Pod: pod})
-}
-
-func (psa *podStoreAdapter) Delete(obj interface{}) error {
-pod := obj.(*api.Pod)
-return psa.FIFO.Delete(&Pod{Pod: pod})
-}
-
-func (psa *podStoreAdapter) Get(obj interface{}) (interface{}, bool, error) {
-pod := obj.(*api.Pod)
-return psa.FIFO.Get(&Pod{Pod: pod})
-}
-
-// Replace will delete the contents of the store, using instead the
-// given map. This store implementation does NOT take ownership of the map.
-func (psa *podStoreAdapter) Replace(objs []interface{}, resourceVersion string) error {
-newobjs := make([]interface{}, len(objs))
-for i, v := range objs {
-pod := v.(*api.Pod)
-newobjs[i] = &Pod{Pod: pod}
-}
-return psa.FIFO.Replace(newobjs, resourceVersion)
-}
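The podStoreAdapter removed above wraps a queue.FIFO so that the *api.Pod objects a reflector pushes are stored as this package's richer *Pod type. A minimal, self-contained sketch of that wrapping pattern, for illustration only (Store, rawPod, richPod and printStore are hypothetical names, not types from this repository):

package main

import "fmt"

// Store is the narrow interface a reflector-like producer needs.
type Store interface {
	Add(obj interface{}) error
}

// rawPod stands in for *api.Pod; richPod stands in for the package's *Pod wrapper.
type rawPod struct{ Name string }
type richPod struct{ *rawPod }

// adapter converts incoming *rawPod objects into *richPod before delegating.
type adapter struct{ inner Store }

func (a *adapter) Add(obj interface{}) error {
	p := obj.(*rawPod) // the producer only ever hands us *rawPod
	return a.inner.Add(&richPod{rawPod: p})
}

// printStore is a trivial Store that just reports what it received.
type printStore struct{}

func (printStore) Add(obj interface{}) error {
	fmt.Printf("stored %T\n", obj)
	return nil
}

func main() {
	var s Store = &adapter{inner: printStore{}}
	_ = s.Add(&rawPod{Name: "nginx"}) // prints: stored *main.richPod
}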
@@ -18,6 +18,7 @@ package podtask
 import (
 "fmt"
+"strings"
 "time"
 
 "github.com/gogo/protobuf/proto"
@@ -62,7 +63,6 @@ type T struct {
 UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master
 
 podStatus api.PodStatus
-executor *mesos.ExecutorInfo // readonly
 podKey string
 launchTime time.Time
 bindTime time.Time
@@ -130,21 +130,49 @@ func generateTaskName(pod *api.Pod) string {
 return fmt.Sprintf("%s.%s.pods", pod.Name, ns)
 }
 
-func (t *T) BuildTaskInfo() *mesos.TaskInfo {
+func setCommandArgument(ei *mesos.ExecutorInfo, flag, value string, create bool) {
+argv := []string{}
+overwrite := false
+if ei.Command != nil && ei.Command.Arguments != nil {
+argv = ei.Command.Arguments
+for i, arg := range argv {
+if strings.HasPrefix(arg, flag+"=") {
+overwrite = true
+argv[i] = flag + "=" + value
+break
+}
+}
+}
+if !overwrite && create {
+argv = append(argv, flag+"="+value)
+if ei.Command == nil {
+ei.Command = &mesos.CommandInfo{}
+}
+ei.Command.Arguments = argv
+}
+}
+
+func (t *T) BuildTaskInfo(prototype *mesos.ExecutorInfo) *mesos.TaskInfo {
 info := &mesos.TaskInfo{
 Name: proto.String(generateTaskName(&t.Pod)),
 TaskId: mutil.NewTaskID(t.ID),
 SlaveId: mutil.NewSlaveID(t.Spec.SlaveID),
-Executor: t.executor,
+Executor: proto.Clone(prototype).(*mesos.ExecutorInfo),
 Data: t.Spec.Data,
 Resources: []*mesos.Resource{
 mutil.NewScalarResource("cpus", float64(t.Spec.CPU)),
 mutil.NewScalarResource("mem", float64(t.Spec.Memory)),
 },
 }
 
 if portsResource := rangeResource("ports", t.Spec.Ports); portsResource != nil {
 info.Resources = append(info.Resources, portsResource)
 }
 
+// the hostname of the executor needs to match that of the offer, otherwise
+// the kubelet node status checker/updater is very unhappy
+setCommandArgument(info.Executor, "--hostname-override", t.Spec.AssignedSlave, true)
+
 return info
 }
 
@@ -170,10 +198,7 @@ func (t *T) Has(f FlagType) (exists bool) {
 return
 }
 
-func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo) (*T, error) {
-if executor == nil {
-return nil, fmt.Errorf("illegal argument: executor was nil")
-}
+func New(ctx api.Context, id string, pod *api.Pod) (*T, error) {
 key, err := MakePodKey(ctx, pod.Name)
 if err != nil {
 return nil, err
@@ -182,13 +207,12 @@ func New(ctx api.Context, id string, pod api.Pod, executor *mesos.ExecutorInfo)
 id = "pod." + uuid.NewUUID().String()
 }
 task := &T{
 ID: id,
-Pod: pod,
+Pod: *pod,
 State: StatePending,
 podKey: key,
-mapper: MappingTypeForPod(&pod),
+mapper: MappingTypeForPod(pod),
 Flags: make(map[FlagType]struct{}),
-executor: proto.Clone(executor).(*mesos.ExecutorInfo),
 }
 task.CreateTime = time.Now()
 return task, nil
@@ -198,7 +222,6 @@ func (t *T) SaveRecoveryInfo(dict map[string]string) {
 dict[annotation.TaskIdKey] = t.ID
 dict[annotation.SlaveIdKey] = t.Spec.SlaveID
 dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue()
-dict[annotation.ExecutorIdKey] = t.executor.ExecutorId.GetValue()
 }
 
 // reconstruct a task from metadata stashed in a pod entry. there are limited pod states that
@@ -256,7 +279,6 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
 annotation.TaskIdKey,
 annotation.SlaveIdKey,
 annotation.OfferIdKey,
-annotation.ExecutorIdKey,
 } {
 v, found := pod.Annotations[k]
 if !found {
@@ -271,10 +293,6 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
 offerId = v
 case annotation.TaskIdKey:
 t.ID = v
-case annotation.ExecutorIdKey:
-// this is nowhere near sufficient to re-launch a task, but we really just
-// want this for tracking
-t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
 }
 }
 t.Offer = offers.Expired(offerId, t.Spec.AssignedSlave, 0)
@@ -35,12 +35,12 @@ const (
 )
 
 func fakePodTask(id string) (*T, error) {
-return New(api.NewDefaultContext(), "", api.Pod{
+return New(api.NewDefaultContext(), "", &api.Pod{
 ObjectMeta: api.ObjectMeta{
 Name: id,
 Namespace: api.NamespaceDefault,
 },
-}, &mesos.ExecutorInfo{})
+})
 }
 
 func TestUnlimitedResources(t *testing.T) {
@@ -52,7 +52,7 @@ func TestDefaultHostPortMatching(t *testing.T) {
 }},
 }},
 }
-task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
+task, err = New(api.NewDefaultContext(), "", pod)
 if err != nil {
 t.Fatal(err)
 }
@@ -100,7 +100,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
 }},
 }},
 }
-task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
+task, err = New(api.NewDefaultContext(), "", pod)
 if err != nil {
 t.Fatal(err)
 }
@@ -123,7 +123,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
 }},
 }},
 }
-task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
+task, err = New(api.NewDefaultContext(), "", pod)
 if err != nil {
 t.Fatal(err)
 }
@@ -144,7 +144,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
 }},
 }},
 }
-task, err = New(api.NewDefaultContext(), "", *pod, &mesos.ExecutorInfo{})
+task, err = New(api.NewDefaultContext(), "", pod)
 if err != nil {
 t.Fatal(err)
 }
@@ -17,8 +17,6 @@ limitations under the License.
 package podtask
 
 import (
-"strings"
-
 log "github.com/golang/glog"
 mesos "github.com/mesos/mesos-go/mesosproto"
 mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
@@ -74,31 +72,11 @@ func ValidateProcurement(t *T, offer *mesos.Offer) error {
 return nil
 }
 
-func setCommandArgument(ei *mesos.ExecutorInfo, flag, value string, create bool) {
-argv := ei.Command.Arguments
-overwrite := false
-for i, arg := range argv {
-if strings.HasPrefix(arg, flag+"=") {
-overwrite = true
-argv[i] = flag + "=" + value
-break
-}
-}
-if !overwrite && create {
-ei.Command.Arguments = append(argv, flag+"="+value)
-}
-}
-
 // NodeProcurement updates t.Spec in preparation for the task to be launched on the
 // slave associated with the offer.
 func NodeProcurement(t *T, offer *mesos.Offer) error {
 t.Spec.SlaveID = offer.GetSlaveId().GetValue()
 t.Spec.AssignedSlave = offer.GetHostname()
 
-// hostname needs of the executor needs to match that of the offer, otherwise
-// the kubelet node status checker/updater is very unhappy
-setCommandArgument(t.executor, "--hostname-override", offer.GetHostname(), true)
-
 return nil
 }
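setCommandArgument, which the hunks above move from procurement.go into task.go, either overwrites an existing "flag=value" entry in the executor's arguments or appends one when create is true. A small standalone illustration of that overwrite-or-append behaviour using plain string slices (setFlag is a hypothetical stand-in, no mesos types involved):

package main

import (
	"fmt"
	"strings"
)

// setFlag mirrors the overwrite-or-append logic: replace an existing
// "flag=value" entry in argv, or append one when create is true.
func setFlag(argv []string, flag, value string, create bool) []string {
	for i, arg := range argv {
		if strings.HasPrefix(arg, flag+"=") {
			argv[i] = flag + "=" + value
			return argv
		}
	}
	if create {
		argv = append(argv, flag+"="+value)
	}
	return argv
}

func main() {
	args := []string{"--v=2"}
	args = setFlag(args, "--hostname-override", "slave-1", true) // appended
	args = setFlag(args, "--hostname-override", "slave-2", true) // overwritten in place
	fmt.Println(args) // [--v=2 --hostname-override=slave-2]
}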
19  contrib/mesos/pkg/scheduler/queuer/doc.go  Normal file
@@ -0,0 +1,19 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package queuer implements a Pod Queuer which stores and yields pods waiting
+// to be scheduled.
+package queuer
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package scheduler
+package queuer
 
 import (
 "fmt"
@@ -29,8 +29,12 @@ import (
 type Pod struct {
 *api.Pod
 deadline *time.Time
-delay *time.Duration
-notify queue.BreakChan
+Delay *time.Duration
+Notify queue.BreakChan
+}
+
+func NewPodWithDeadline(pod *api.Pod, deadline *time.Time) *Pod {
+return &Pod{Pod: pod, deadline: deadline}
 }
 
 // implements Copyable
@@ -54,21 +58,21 @@ func (p *Pod) GetUID() string {
 
 // implements Deadlined
 func (dp *Pod) Deadline() (time.Time, bool) {
 if dp.deadline != nil {
 return *(dp.deadline), true
 }
 return time.Time{}, false
 }
 
 func (dp *Pod) GetDelay() time.Duration {
-if dp.delay != nil {
-return *(dp.delay)
+if dp.Delay != nil {
+return *(dp.Delay)
 }
 return 0
 }
 
 func (p *Pod) Breaker() queue.BreakChan {
-return p.notify
+return p.Notify
 }
 
 func (p *Pod) String() string {
209  contrib/mesos/pkg/scheduler/queuer/queuer.go  Normal file
@@ -0,0 +1,209 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package queuer
+
+import (
+"fmt"
+"io"
+"net/http"
+"sync"
+"time"
+
+log "github.com/golang/glog"
+"k8s.io/kubernetes/contrib/mesos/pkg/queue"
+"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
+annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
+"k8s.io/kubernetes/pkg/api"
+"k8s.io/kubernetes/pkg/client/cache"
+)
+
+const (
+enqueuePopTimeout = 200 * time.Millisecond
+enqueueWaitTimeout = 1 * time.Second
+yieldPopTimeout = 200 * time.Millisecond
+yieldWaitTimeout = 1 * time.Second
+)
+
+type Queuer interface {
+InstallDebugHandlers(mux *http.ServeMux)
+UpdatesAvailable()
+Dequeue(id string)
+Requeue(pod *Pod)
+Reoffer(pod *Pod)
+
+Yield() *api.Pod
+
+Run(done <-chan struct{})
+}
+
+type queuer struct {
+lock sync.Mutex // shared by condition variables of this struct
+updates queue.FIFO // queue of pod updates to be processed
+queue *queue.DelayFIFO // queue of pods to be scheduled
+deltaCond sync.Cond // pod changes are available for processing
+unscheduledCond sync.Cond // there are unscheduled pods for processing
+}
+
+func New(queue *queue.DelayFIFO, updates queue.FIFO) Queuer {
+q := &queuer{
+queue: queue,
+updates: updates,
+}
+q.deltaCond.L = &q.lock
+q.unscheduledCond.L = &q.lock
+return q
+}
+
+func (q *queuer) InstallDebugHandlers(mux *http.ServeMux) {
+mux.HandleFunc("/debug/scheduler/podqueue", func(w http.ResponseWriter, r *http.Request) {
+for _, x := range q.queue.List() {
+if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
+break
+}
+}
+})
+mux.HandleFunc("/debug/scheduler/podstore", func(w http.ResponseWriter, r *http.Request) {
+for _, x := range q.updates.List() {
+if _, err := io.WriteString(w, fmt.Sprintf("%+v\n", x)); err != nil {
+break
+}
+}
+})
+}
+
+// signal that there are probably pod updates waiting to be processed
+func (q *queuer) UpdatesAvailable() {
+q.deltaCond.Broadcast()
+}
+
+// delete a pod from the to-be-scheduled queue
+func (q *queuer) Dequeue(id string) {
+q.queue.Delete(id)
+}
+
+// re-add a pod to the to-be-scheduled queue, will not overwrite existing pod data (that
+// may have already changed).
+func (q *queuer) Requeue(pod *Pod) {
+// use KeepExisting in case the pod has already been updated (can happen if binding fails
+// due to constraint violations); we don't want to overwrite a newer entry with stale data.
+q.queue.Add(pod, queue.KeepExisting)
+q.unscheduledCond.Broadcast()
+}
+
+// same as Requeue but calls podQueue.Offer instead of podQueue.Add
+func (q *queuer) Reoffer(pod *Pod) {
+// use KeepExisting in case the pod has already been updated (can happen if binding fails
+// due to constraint violations); we don't want to overwrite a newer entry with stale data.
+if q.queue.Offer(pod, queue.KeepExisting) {
+q.unscheduledCond.Broadcast()
+}
+}
+
+// spawns a go-routine to watch for unscheduled pods and queue them up
+// for scheduling. returns immediately.
+func (q *queuer) Run(done <-chan struct{}) {
+go runtime.Until(func() {
+log.Info("Watching for newly created pods")
+q.lock.Lock()
+defer q.lock.Unlock()
+
+for {
+// limit blocking here for short intervals so that scheduling
+// may proceed even if there have been no recent pod changes
+p := q.updates.Await(enqueuePopTimeout)
+if p == nil {
+signalled := runtime.After(q.deltaCond.Wait)
+// we've yielded the lock
+select {
+case <-time.After(enqueueWaitTimeout):
+q.deltaCond.Broadcast() // abort Wait()
+<-signalled // wait for lock re-acquisition
+log.V(4).Infoln("timed out waiting for a pod update")
+case <-signalled:
+// we've acquired the lock and there may be
+// changes for us to process now
+}
+continue
+}
+
+pod := p.(*Pod)
+if recoverAssignedSlave(pod.Pod) != "" {
+log.V(3).Infof("dequeuing assigned pod for scheduling: %v", pod.Pod.Name)
+q.Dequeue(pod.GetUID())
+} else {
+// use ReplaceExisting because we are always pushing the latest state
+now := time.Now()
+pod.deadline = &now
+if q.queue.Offer(pod, queue.ReplaceExisting) {
+q.unscheduledCond.Broadcast()
+log.V(3).Infof("queued pod for scheduling: %v", pod.Pod.Name)
+} else {
+log.Warningf("failed to queue pod for scheduling: %v", pod.Pod.Name)
+}
+}
+}
+}, 1*time.Second, done)
+}
+
+// implementation of scheduling plugin's NextPod func; see k8s plugin/pkg/scheduler
+func (q *queuer) Yield() *api.Pod {
+log.V(2).Info("attempting to yield a pod")
+q.lock.Lock()
+defer q.lock.Unlock()
+
+for {
+// limit blocking here to short intervals so that we don't block the
+// enqueuer Run() routine for very long
+kpod := q.queue.Await(yieldPopTimeout)
+if kpod == nil {
+signalled := runtime.After(q.unscheduledCond.Wait)
+// lock is yielded at this point and we're going to wait for either
+// a timeout, or a signal that there's data
+select {
+case <-time.After(yieldWaitTimeout):
+q.unscheduledCond.Broadcast() // abort Wait()
+<-signalled // wait for the go-routine, and the lock
+log.V(4).Infoln("timed out waiting for a pod to yield")
+case <-signalled:
+// we have acquired the lock, and there
+// may be a pod for us to pop now
+}
+continue
+}
+
+pod := kpod.(*Pod).Pod
+if podName, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
+log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
+} else if !q.updates.Poll(podName, queue.POP_EVENT) {
+log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
+} else if recoverAssignedSlave(pod) != "" {
+// should never happen if enqueuePods is filtering properly
+log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
+} else {
+return pod
+}
+}
+}
+
+// recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
+// the BindingHostKey. For tasks in the registry of the scheduler, the same
+// value is stored in T.Spec.AssignedSlave. Before launching, the BindingHostKey
+// annotation is added and the executor will eventually persist that to the
+// apiserver on binding.
+func recoverAssignedSlave(pod *api.Pod) string {
+return pod.Annotations[annotation.BindingHostKey]
+}
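Run and Yield above share one pattern: they block on a sync.Cond but bound the wait with a timeout by broadcasting from a timer, using runtime.After to learn when Wait has returned and the lock is held again. A minimal sketch of that pattern with only the standard library (the after helper is an illustrative stand-in for the project's runtime.After):

package main

import (
	"fmt"
	"sync"
	"time"
)

// after runs f in a goroutine and closes the returned channel when f returns;
// with f == cond.Wait this tells us when the lock has been re-acquired.
func after(f func()) <-chan struct{} {
	ch := make(chan struct{})
	go func() {
		defer close(ch)
		f()
	}()
	return ch
}

func main() {
	var mu sync.Mutex
	cond := sync.NewCond(&mu)

	mu.Lock()
	defer mu.Unlock()

	signalled := after(cond.Wait) // Wait releases the lock while blocked
	select {
	case <-time.After(200 * time.Millisecond):
		cond.Broadcast() // abort the Wait()
		<-signalled      // wait for the goroutine, and the lock
		fmt.Println("timed out waiting for a signal")
	case <-signalled:
		fmt.Println("woken by a broadcast")
	}
}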
@@ -17,905 +17,22 @@ limitations under the License.
 package scheduler
 
 import (
-"fmt"
-"io"
-"math"
-"net/http"
 "sync"
-"time"
 
-log "github.com/golang/glog"
-mesos "github.com/mesos/mesos-go/mesosproto"
-mutil "github.com/mesos/mesos-go/mesosutil"
-bindings "github.com/mesos/mesos-go/scheduler"
-execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
-"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
-"k8s.io/kubernetes/contrib/mesos/pkg/node"
 "k8s.io/kubernetes/contrib/mesos/pkg/offers"
-offermetrics "k8s.io/kubernetes/contrib/mesos/pkg/offers/metrics"
-"k8s.io/kubernetes/contrib/mesos/pkg/proc"
-"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
-schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
-"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
-"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
 "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
-"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/slave"
-"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
-"k8s.io/kubernetes/pkg/api"
-"k8s.io/kubernetes/pkg/api/errors"
-client "k8s.io/kubernetes/pkg/client/unversioned"
-"k8s.io/kubernetes/pkg/fields"
-"k8s.io/kubernetes/pkg/kubelet/container"
-kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
-"k8s.io/kubernetes/pkg/labels"
-"k8s.io/kubernetes/pkg/tools"
-"k8s.io/kubernetes/pkg/util/sets"
 )
 
-type PluginInterface interface {
-// the apiserver may have a different state for the pod than we do
-// so reconcile our records, but only for this one pod
-reconcileTask(*podtask.T)
-
-// execute the Scheduling plugin, should start a go routine and return immediately
-Run(<-chan struct{})
-}
-
-// KubernetesScheduler implements:
-// 1: A mesos scheduler.
-// 2: A kubernetes scheduler plugin.
-// 3: A kubernetes pod.Registry.
+// Scheduler abstracts everything other components of the scheduler need
+// to access from each other
+type Scheduler interface {
+Tasks() podtask.Registry
+sync.Locker // synchronize changes to tasks, i.e. lock, get task, change task, store task, unlock
+
+Offers() offers.Registry
+Reconcile(t *podtask.T)
+KillTask(id string) error
+LaunchTask(t *podtask.T) error
+
+Run(done <-chan struct{})
-type KubernetesScheduler struct {
-// We use a lock here to avoid races
-// between invoking the mesos callback
-// and the invoking the pod registry interfaces.
-// In particular, changes to podtask.T objects are currently guarded by this lock.
-*sync.RWMutex
-PodScheduler
-
-// Config related, write-once
-
-schedcfg *schedcfg.Config
-executor *mesos.ExecutorInfo
-executorGroup uint64
-client *client.Client
-etcdClient tools.EtcdClient
-failoverTimeout float64 // in seconds
-reconcileInterval int64
-nodeRegistrator node.Registrator
-
-// Mesos context.
-
-driver bindings.SchedulerDriver // late initialization
-frameworkId *mesos.FrameworkID
-masterInfo *mesos.MasterInfo
-registered bool
-registration chan struct{} // signal chan that closes upon first successful registration
-onRegistration sync.Once
-offers offers.Registry
-slaveHostNames *slave.Registry
-
-// unsafe state, needs to be guarded
-
-taskRegistry podtask.Registry
-
-// via deferred init
-
-plugin PluginInterface
-reconciler *Reconciler
-reconcileCooldown time.Duration
-asRegisteredMaster proc.Doer
-terminate <-chan struct{} // signal chan, closes when we should kill background tasks
-}
-
-type Config struct {
-Schedcfg schedcfg.Config
-Executor *mesos.ExecutorInfo
-Scheduler PodScheduler
-Client *client.Client
-EtcdClient tools.EtcdClient
-FailoverTimeout float64
-ReconcileInterval int64
-ReconcileCooldown time.Duration
-LookupNode node.LookupFunc
-}
-
-// New creates a new KubernetesScheduler
-func New(config Config) *KubernetesScheduler {
-var k *KubernetesScheduler
-k = &KubernetesScheduler{
-schedcfg: &config.Schedcfg,
-RWMutex: new(sync.RWMutex),
-executor: config.Executor,
-executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
-PodScheduler: config.Scheduler,
-client: config.Client,
-etcdClient: config.EtcdClient,
-failoverTimeout: config.FailoverTimeout,
-reconcileInterval: config.ReconcileInterval,
-nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode),
-offers: offers.CreateRegistry(offers.RegistryConfig{
-Compat: func(o *mesos.Offer) bool {
-// the node must be registered and have up-to-date labels
-n := config.LookupNode(o.GetHostname())
-if n == nil || !node.IsUpToDate(n, node.SlaveAttributesToLabels(o.GetAttributes())) {
-return false
-}
-
-// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours
-for _, eid := range o.GetExecutorIds() {
-execuid := uid.Parse(eid.GetValue())
-if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
-return false
-}
-}
-
-return true
-},
-DeclineOffer: func(id string) <-chan error {
-errOnce := proc.NewErrorOnce(k.terminate)
-errOuter := k.asRegisteredMaster.Do(func() {
-var err error
-defer errOnce.Report(err)
-offerId := mutil.NewOfferID(id)
-filters := &mesos.Filters{}
-_, err = k.driver.DeclineOffer(offerId, filters)
-})
-return errOnce.Send(errOuter).Err()
-},
-// remember expired offers so that we can tell if a previously scheduler offer relies on one
-LingerTTL: config.Schedcfg.OfferLingerTTL.Duration,
-TTL: config.Schedcfg.OfferTTL.Duration,
-ListenerDelay: config.Schedcfg.ListenerDelay.Duration,
-}),
-slaveHostNames: slave.NewRegistry(),
-taskRegistry: podtask.NewInMemoryRegistry(),
-reconcileCooldown: config.ReconcileCooldown,
-registration: make(chan struct{}),
-asRegisteredMaster: proc.DoerFunc(func(proc.Action) <-chan error {
-return proc.ErrorChanf("cannot execute action with unregistered scheduler")
-}),
-}
-return k
-}
-
-func (k *KubernetesScheduler) Init(electedMaster proc.Process, pl PluginInterface, mux *http.ServeMux) error {
-log.V(1).Infoln("initializing kubernetes mesos scheduler")
-
-k.asRegisteredMaster = proc.DoerFunc(func(a proc.Action) <-chan error {
-if !k.registered {
-return proc.ErrorChanf("failed to execute action, scheduler is disconnected")
-}
-return electedMaster.Do(a)
-})
-k.terminate = electedMaster.Done()
-k.plugin = pl
-k.offers.Init(k.terminate)
-k.InstallDebugHandlers(mux)
-k.nodeRegistrator.Run(k.terminate)
-return k.recoverTasks()
-}
-
-func (k *KubernetesScheduler) asMaster() proc.Doer {
-k.RLock()
-defer k.RUnlock()
-return k.asRegisteredMaster
-}
-
-func (k *KubernetesScheduler) InstallDebugHandlers(mux *http.ServeMux) {
-wrappedHandler := func(uri string, h http.Handler) {
-mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
-ch := make(chan struct{})
-closer := runtime.Closer(ch)
-proc.OnError(k.asMaster().Do(func() {
-defer closer()
-h.ServeHTTP(w, r)
-}), func(err error) {
-defer closer()
-log.Warningf("failed HTTP request for %s: %v", uri, err)
-w.WriteHeader(http.StatusServiceUnavailable)
-}, k.terminate)
-select {
-case <-time.After(k.schedcfg.HttpHandlerTimeout.Duration):
-log.Warningf("timed out waiting for request to be processed")
-w.WriteHeader(http.StatusServiceUnavailable)
-return
-case <-ch: // noop
-}
-})
-}
-requestReconciliation := func(uri string, requestAction func()) {
-wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-requestAction()
-w.WriteHeader(http.StatusNoContent)
-}))
-}
-requestReconciliation("/debug/actions/requestExplicit", k.reconciler.RequestExplicit)
-requestReconciliation("/debug/actions/requestImplicit", k.reconciler.RequestImplicit)
-
-wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-slaves := k.slaveHostNames.SlaveIDs()
-for _, slaveId := range slaves {
-_, err := k.driver.SendFrameworkMessage(
-k.executor.ExecutorId,
-mutil.NewSlaveID(slaveId),
-messages.Kamikaze)
-if err != nil {
-log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
-} else {
-io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
-}
-}
-io.WriteString(w, "OK")
-}))
-}
-
-func (k *KubernetesScheduler) Registration() <-chan struct{} {
-return k.registration
-}
-
-// Registered is called when the scheduler registered with the master successfully.
-func (k *KubernetesScheduler) Registered(drv bindings.SchedulerDriver, fid *mesos.FrameworkID, mi *mesos.MasterInfo) {
-log.Infof("Scheduler registered with the master: %v with frameworkId: %v\n", mi, fid)
-
-k.driver = drv
-k.frameworkId = fid
-k.masterInfo = mi
-k.registered = true
-
-k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
-k.reconciler.RequestExplicit()
-}
-
-func (k *KubernetesScheduler) storeFrameworkId() {
-// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
-_, err := k.etcdClient.Set(meta.FrameworkIDKey, k.frameworkId.GetValue(), uint64(k.failoverTimeout))
-if err != nil {
-log.Errorf("failed to renew frameworkId TTL: %v", err)
-}
-}
-
-// Reregistered is called when the scheduler re-registered with the master successfully.
-// This happends when the master fails over.
-func (k *KubernetesScheduler) Reregistered(drv bindings.SchedulerDriver, mi *mesos.MasterInfo) {
-log.Infof("Scheduler reregistered with the master: %v\n", mi)
-
-k.driver = drv
-k.masterInfo = mi
-k.registered = true
-
-k.onRegistration.Do(func() { k.onInitialRegistration(drv) })
-k.reconciler.RequestExplicit()
-}
-
-// perform one-time initialization actions upon the first registration event received from Mesos.
-func (k *KubernetesScheduler) onInitialRegistration(driver bindings.SchedulerDriver) {
-defer close(k.registration)
-
-if k.failoverTimeout > 0 {
-refreshInterval := k.schedcfg.FrameworkIdRefreshInterval.Duration
-if k.failoverTimeout < k.schedcfg.FrameworkIdRefreshInterval.Duration.Seconds() {
-refreshInterval = time.Duration(math.Max(1, k.failoverTimeout/2)) * time.Second
-}
-go runtime.Until(k.storeFrameworkId, refreshInterval, k.terminate)
-}
-
-r1 := k.makeTaskRegistryReconciler()
-r2 := k.makePodRegistryReconciler()
-
-k.reconciler = newReconciler(k.asRegisteredMaster, k.makeCompositeReconciler(r1, r2),
-k.reconcileCooldown, k.schedcfg.ExplicitReconciliationAbortTimeout.Duration, k.terminate)
-go k.reconciler.Run(driver)
-
-if k.reconcileInterval > 0 {
-ri := time.Duration(k.reconcileInterval) * time.Second
-time.AfterFunc(k.schedcfg.InitialImplicitReconciliationDelay.Duration, func() { runtime.Until(k.reconciler.RequestImplicit, ri, k.terminate) })
-log.Infof("will perform implicit task reconciliation at interval: %v after %v", ri, k.schedcfg.InitialImplicitReconciliationDelay.Duration)
-}
-}
-
-// Disconnected is called when the scheduler loses connection to the master.
-func (k *KubernetesScheduler) Disconnected(driver bindings.SchedulerDriver) {
-log.Infof("Master disconnected!\n")
-
-k.registered = false
-
-// discard all cached offers to avoid unnecessary TASK_LOST updates
-k.offers.Invalidate("")
-}
-
-// ResourceOffers is called when the scheduler receives some offers from the master.
-func (k *KubernetesScheduler) ResourceOffers(driver bindings.SchedulerDriver, offers []*mesos.Offer) {
-log.V(2).Infof("Received offers %+v", offers)
-
-// Record the offers in the global offer map as well as each slave's offer map.
-k.offers.Add(offers)
-for _, offer := range offers {
-slaveId := offer.GetSlaveId().GetValue()
-k.slaveHostNames.Register(slaveId, offer.GetHostname())
-
-// create api object if not existing already
-if k.nodeRegistrator != nil {
-labels := node.SlaveAttributesToLabels(offer.GetAttributes())
-_, err := k.nodeRegistrator.Register(offer.GetHostname(), labels)
-if err != nil {
-log.Error(err)
-}
-}
-}
-}
-
-// OfferRescinded is called when the resources are recinded from the scheduler.
-func (k *KubernetesScheduler) OfferRescinded(driver bindings.SchedulerDriver, offerId *mesos.OfferID) {
-log.Infof("Offer rescinded %v\n", offerId)
-
-oid := offerId.GetValue()
-k.offers.Delete(oid, offermetrics.OfferRescinded)
-}
-
-// StatusUpdate is called when a status update message is sent to the scheduler.
-func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
-
-source, reason := "none", "none"
-if taskStatus.Source != nil {
-source = (*taskStatus.Source).String()
-}
-if taskStatus.Reason != nil {
-reason = (*taskStatus.Reason).String()
-}
-taskState := taskStatus.GetState()
-metrics.StatusUpdates.WithLabelValues(source, reason, taskState.String()).Inc()
-
-message := "none"
-if taskStatus.Message != nil {
-message = *taskStatus.Message
-}
-
-log.Infof(
-"task status update %q from %q for task %q on slave %q executor %q for reason %q with message %q",
-taskState.String(),
-source,
-taskStatus.TaskId.GetValue(),
-taskStatus.SlaveId.GetValue(),
-taskStatus.ExecutorId.GetValue(),
-reason,
-message,
-)
-
-switch taskState {
-case mesos.TaskState_TASK_RUNNING, mesos.TaskState_TASK_FINISHED, mesos.TaskState_TASK_STARTING, mesos.TaskState_TASK_STAGING:
-if _, state := k.taskRegistry.UpdateStatus(taskStatus); state == podtask.StateUnknown {
-if taskState != mesos.TaskState_TASK_FINISHED {
-//TODO(jdef) what if I receive this after a TASK_LOST or TASK_KILLED?
-//I don't want to reincarnate then.. TASK_LOST is a special case because
-//the master is stateless and there are scenarios where I may get TASK_LOST
-//followed by TASK_RUNNING.
-//TODO(jdef) consider running this asynchronously since there are API server
-//calls that may be made
-k.reconcileNonTerminalTask(driver, taskStatus)
-} // else, we don't really care about FINISHED tasks that aren't registered
-return
-}
-if hostName := k.slaveHostNames.HostName(taskStatus.GetSlaveId().GetValue()); hostName == "" {
-// a registered task has an update reported by a slave that we don't recognize.
-// this should never happen! So we don't reconcile it.
-log.Errorf("Ignore status %+v because the slave does not exist", taskStatus)
-return
-}
-case mesos.TaskState_TASK_FAILED, mesos.TaskState_TASK_ERROR:
-if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
-if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
-go k.plugin.reconcileTask(task)
-return
-}
-} else {
-// unknown task failed, not much we can do about it
-return
-}
-// last-ditch effort to reconcile our records
-fallthrough
-case mesos.TaskState_TASK_LOST, mesos.TaskState_TASK_KILLED:
-k.reconcileTerminalTask(driver, taskStatus)
-default:
-log.Errorf(
-"unknown task status %q from %q for task %q on slave %q executor %q for reason %q with message %q",
-taskState.String(),
-source,
-taskStatus.TaskId.GetValue(),
-taskStatus.SlaveId.GetValue(),
-taskStatus.ExecutorId.GetValue(),
-reason,
-message,
-)
-}
-}
-
-func (k *KubernetesScheduler) reconcileTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
-task, state := k.taskRegistry.UpdateStatus(taskStatus)
-
-if (state == podtask.StateRunning || state == podtask.StatePending) &&
-((taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER && taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION) ||
-(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
-(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
-(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared)) {
-//--
-// pod-task has metadata that refers to:
-// (1) a task that Mesos no longer knows about, or else
-// (2) a pod that the Kubelet will never report as "failed"
-// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
-// For now, destroy the pod and hope that there's a replication controller backing it up.
-// TODO(jdef) for case #2 don't delete the pod, just update it's status to Failed
-pod := &task.Pod
-log.Warningf("deleting rogue pod %v/%v for lost task %v", pod.Namespace, pod.Name, task.ID)
-if err := k.client.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil && !errors.IsNotFound(err) {
-log.Errorf("failed to delete pod %v/%v for terminal task %v: %v", pod.Namespace, pod.Name, task.ID, err)
-}
-} else if taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED || taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED {
-// attempt to prevent dangling pods in the pod and task registries
-log.V(1).Infof("request explicit reconciliation to clean up for task %v after executor reported (terminated/unregistered)", taskStatus.TaskId.GetValue())
-k.reconciler.RequestExplicit()
-} else if taskStatus.GetState() == mesos.TaskState_TASK_LOST && state == podtask.StateRunning && taskStatus.ExecutorId != nil && taskStatus.SlaveId != nil {
-//TODO(jdef) this may not be meaningful once we have proper checkpointing and master detection
-//If we're reconciling and receive this then the executor may be
-//running a task that we need it to kill. It's possible that the framework
-//is unrecognized by the master at this point, so KillTask is not guaranteed
-//to do anything. The underlying driver transport may be able to send a
-//FrameworkMessage directly to the slave to terminate the task.
-log.V(2).Info("forwarding TASK_LOST message to executor %v on slave %v", taskStatus.ExecutorId, taskStatus.SlaveId)
-data := fmt.Sprintf("%s:%s", messages.TaskLost, task.ID) //TODO(jdef) use a real message type
-if _, err := driver.SendFrameworkMessage(taskStatus.ExecutorId, taskStatus.SlaveId, data); err != nil {
-log.Error(err.Error())
-}
-}
-}
-
-// reconcile an unknown (from the perspective of our registry) non-terminal task
-func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
-// attempt to recover task from pod info:
-// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
-// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
-// - pull the pod metadata down from the api server
-// - perform task recovery based on pod metadata
-taskId := taskStatus.TaskId.GetValue()
-if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
-// there will be no data in the task status that we can use to determine the associated pod
-switch taskStatus.GetState() {
-case mesos.TaskState_TASK_STAGING:
-// there is still hope for this task, don't kill it just yet
-//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
-return
-default:
-// for TASK_{STARTING,RUNNING} we should have already attempted to recoverTasks() for.
-// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
-// be processing this reconciliation update before we process the one from the executor.
-// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
-// so it gets killed.
-log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
-}
-} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
-// possible rogue pod exists at this point because we can't identify it; should kill the task
-log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
-} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
-// possible rogue pod exists at this point because we can't identify it; should kill the task
-log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
-podStatus.Name, taskId, err)
-} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
-if t, ok, err := podtask.RecoverFrom(*pod); ok {
-log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
-_, err := k.taskRegistry.Register(t)
-if err != nil {
-// someone beat us to it?!
-log.Warningf("failed to register recovered task: %v", err)
-return
-} else {
-k.taskRegistry.UpdateStatus(taskStatus)
-}
-return
-} else if err != nil {
-//should kill the pod and the task
-log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
-if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
-log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
-}
-} else {
-//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
-//metadata is not appropriate for task reconstruction -- which should almost certainly never
-//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
-//we were failed over.
-
-//kill this task, allow the newly launched scheduler to schedule the new pod
-log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
-}
-} else if errors.IsNotFound(err) {
-// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
-log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
-} else if errors.IsServerTimeout(err) {
-log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
-return
-} else {
-log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
-return
-}
-if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
-log.Errorf("failed to kill task %v: %v", taskId, err)
-}
-}
-
-// FrameworkMessage is called when the scheduler receives a message from the executor.
-func (k *KubernetesScheduler) FrameworkMessage(driver bindings.SchedulerDriver,
-executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, message string) {
-log.Infof("Received messages from executor %v of slave %v, %v\n", executorId, slaveId, message)
-}
-
-// SlaveLost is called when some slave is lost.
-func (k *KubernetesScheduler) SlaveLost(driver bindings.SchedulerDriver, slaveId *mesos.SlaveID) {
-log.Infof("Slave %v is lost\n", slaveId)
-
-sid := slaveId.GetValue()
-k.offers.InvalidateForSlave(sid)
-
-// TODO(jdef): delete slave from our internal list? probably not since we may need to reconcile
-// tasks. it would be nice to somehow flag the slave as lost so that, perhaps, we can periodically
-// flush lost slaves older than X, and for which no tasks or pods reference.
-
-// unfinished tasks/pods will be dropped. use a replication controller if you want pods to
-// be restarted when slaves die.
-}
-
-// ExecutorLost is called when some executor is lost.
-func (k *KubernetesScheduler) ExecutorLost(driver bindings.SchedulerDriver, executorId *mesos.ExecutorID, slaveId *mesos.SlaveID, status int) {
-log.Infof("Executor %v of slave %v is lost, status: %v\n", executorId, slaveId, status)
-// TODO(yifan): Restart any unfinished tasks of the executor.
-}
-
-// Error is called when there is an unrecoverable error in the scheduler or scheduler driver.
-// The driver should have been aborted before this is invoked.
-func (k *KubernetesScheduler) Error(driver bindings.SchedulerDriver, message string) {
-log.Fatalf("fatal scheduler error: %v\n", message)
-}
-
-// filter func used for explicit task reconciliation, selects only non-terminal tasks which
-// have been communicated to mesos (read: launched).
-func explicitTaskFilter(t *podtask.T) bool {
-switch t.State {
-case podtask.StateRunning:
-return true
-case podtask.StatePending:
-return t.Has(podtask.Launched)
-default:
-return false
-}
-}
-
-// invoke the given ReconcilerAction funcs in sequence, aborting the sequence if reconciliation
-// is cancelled. if any other errors occur the composite reconciler will attempt to complete the
-// sequence, reporting only the last generated error.
-func (k *KubernetesScheduler) makeCompositeReconciler(actions ...ReconcilerAction) ReconcilerAction {
-if x := len(actions); x == 0 {
-// programming error
-panic("no actions specified for composite reconciler")
-} else if x == 1 {
-return actions[0]
-}
-chained := func(d bindings.SchedulerDriver, c <-chan struct{}, a, b ReconcilerAction) <-chan error {
-ech := a(d, c)
-ch := make(chan error, 1)
-go func() {
-select {
-case <-k.terminate:
-case <-c:
-case e := <-ech:
-if e != nil {
-ch <- e
-return
-}
-ech = b(d, c)
-select {
-case <-k.terminate:
-case <-c:
-case e := <-ech:
-if e != nil {
-ch <- e
-return
-}
-close(ch)
-return
-}
-}
-ch <- fmt.Errorf("aborting composite reconciler action")
-}()
-return ch
-}
-result := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
-return chained(d, c, actions[0], actions[1])
-}
-for i := 2; i < len(actions); i++ {
-i := i
-next := func(d bindings.SchedulerDriver, c <-chan struct{}) <-chan error {
-return chained(d, c, ReconcilerAction(result), actions[i])
-}
-result = next
-}
-return ReconcilerAction(result)
-}
-
-// reconciler action factory, performs explicit task reconciliation for non-terminal
-// tasks listed in the scheduler's internal taskRegistry.
-func (k *KubernetesScheduler) makeTaskRegistryReconciler() ReconcilerAction {
-return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
-taskToSlave := make(map[string]string)
-for _, t := range k.taskRegistry.List(explicitTaskFilter) {
-if t.Spec.SlaveID != "" {
-taskToSlave[t.ID] = t.Spec.SlaveID
-}
-}
-return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// reconciler action factory, performs explicit task reconciliation for non-terminal
|
|
||||||
// tasks identified by annotations in the Kubernetes pod registry.
|
|
||||||
func (k *KubernetesScheduler) makePodRegistryReconciler() ReconcilerAction {
|
|
||||||
return ReconcilerAction(func(drv bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
|
|
||||||
podList, err := k.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
|
|
||||||
if err != nil {
|
|
||||||
return proc.ErrorChanf("failed to reconcile pod registry: %v", err)
|
|
||||||
}
|
|
||||||
taskToSlave := make(map[string]string)
|
|
||||||
for _, pod := range podList.Items {
|
|
||||||
if len(pod.Annotations) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
taskId, found := pod.Annotations[meta.TaskIdKey]
|
|
||||||
if !found {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
slaveId, found := pod.Annotations[meta.SlaveIdKey]
|
|
||||||
if !found {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
taskToSlave[taskId] = slaveId
|
|
||||||
}
|
|
||||||
return proc.ErrorChan(k.explicitlyReconcileTasks(drv, taskToSlave, cancel))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// execute an explicit task reconciliation, as per http://mesos.apache.org/documentation/latest/reconciliation/
|
|
||||||
func (k *KubernetesScheduler) explicitlyReconcileTasks(driver bindings.SchedulerDriver, taskToSlave map[string]string, cancel <-chan struct{}) error {
|
|
||||||
log.Info("explicit reconcile tasks")
|
|
||||||
|
|
||||||
// tell mesos to send us the latest status updates for all the non-terminal tasks that we know about
|
|
||||||
statusList := []*mesos.TaskStatus{}
|
|
||||||
remaining := sets.StringKeySet(taskToSlave)
|
|
||||||
for taskId, slaveId := range taskToSlave {
|
|
||||||
if slaveId == "" {
|
|
||||||
delete(taskToSlave, taskId)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
statusList = append(statusList, &mesos.TaskStatus{
|
|
||||||
TaskId: mutil.NewTaskID(taskId),
|
|
||||||
SlaveId: mutil.NewSlaveID(slaveId),
|
|
||||||
State: mesos.TaskState_TASK_RUNNING.Enum(), // req'd field, doesn't have to reflect reality
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
select {
|
|
||||||
case <-cancel:
|
|
||||||
return reconciliationCancelledErr
|
|
||||||
default:
|
|
||||||
if _, err := driver.ReconcileTasks(statusList); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
start := time.Now()
|
|
||||||
first := true
|
|
||||||
for backoff := 1 * time.Second; first || remaining.Len() > 0; backoff = backoff * 2 {
|
|
||||||
first = false
|
|
||||||
// nothing to do here other than wait for status updates..
|
|
||||||
if backoff > k.schedcfg.ExplicitReconciliationMaxBackoff.Duration {
|
|
||||||
backoff = k.schedcfg.ExplicitReconciliationMaxBackoff.Duration
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-cancel:
|
|
||||||
return reconciliationCancelledErr
|
|
||||||
case <-time.After(backoff):
|
|
||||||
for taskId := range remaining {
|
|
||||||
if task, _ := k.taskRegistry.Get(taskId); task != nil && explicitTaskFilter(task) && task.UpdatedTime.Before(start) {
|
|
||||||
// keep this task in remaining list
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
remaining.Delete(taskId)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
reconciliationCancelledErr = fmt.Errorf("explicit task reconciliation cancelled")
|
|
||||||
)
|
|
||||||
|
|
||||||
type ReconcilerAction func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error
|
|
||||||
|
|
||||||
type Reconciler struct {
|
|
||||||
proc.Doer
|
|
||||||
Action ReconcilerAction
|
|
||||||
explicit chan struct{} // send an empty struct to trigger explicit reconciliation
|
|
||||||
implicit chan struct{} // send an empty struct to trigger implicit reconciliation
|
|
||||||
done <-chan struct{} // close this when you want the reconciler to exit
|
|
||||||
cooldown time.Duration
|
|
||||||
explicitReconciliationAbortTimeout time.Duration
|
|
||||||
}
|
|
||||||
|
|
||||||
func newReconciler(doer proc.Doer, action ReconcilerAction,
|
|
||||||
cooldown, explicitReconciliationAbortTimeout time.Duration, done <-chan struct{}) *Reconciler {
|
|
||||||
return &Reconciler{
|
|
||||||
Doer: doer,
|
|
||||||
explicit: make(chan struct{}, 1),
|
|
||||||
implicit: make(chan struct{}, 1),
|
|
||||||
cooldown: cooldown,
|
|
||||||
explicitReconciliationAbortTimeout: explicitReconciliationAbortTimeout,
|
|
||||||
done: done,
|
|
||||||
Action: func(driver bindings.SchedulerDriver, cancel <-chan struct{}) <-chan error {
|
|
||||||
// trigged the reconciler action in the doer's execution context,
|
|
||||||
// but it could take a while and the scheduler needs to be able to
|
|
||||||
// process updates, the callbacks for which ALSO execute in the SAME
|
|
||||||
// deferred execution context -- so the action MUST be executed async.
|
|
||||||
errOnce := proc.NewErrorOnce(cancel)
|
|
||||||
return errOnce.Send(doer.Do(func() {
|
|
||||||
// only triggers the action if we're the currently elected,
|
|
||||||
// registered master and runs the action async.
|
|
||||||
go func() {
|
|
||||||
var err <-chan error
|
|
||||||
defer errOnce.Send(err)
|
|
||||||
err = action(driver, cancel)
|
|
||||||
}()
|
|
||||||
})).Err()
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reconciler) RequestExplicit() {
|
|
||||||
select {
|
|
||||||
case r.explicit <- struct{}{}: // noop
|
|
||||||
default: // request queue full; noop
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reconciler) RequestImplicit() {
|
|
||||||
select {
|
|
||||||
case r.implicit <- struct{}{}: // noop
|
|
||||||
default: // request queue full; noop
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// execute task reconciliation, returns when r.done is closed. intended to run as a goroutine.
|
|
||||||
// if reconciliation is requested while another is in progress, the in-progress operation will be
|
|
||||||
// cancelled before the new reconciliation operation begins.
|
|
||||||
func (r *Reconciler) Run(driver bindings.SchedulerDriver) {
|
|
||||||
var cancel, finished chan struct{}
|
|
||||||
requestLoop:
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-r.done:
|
|
||||||
return
|
|
||||||
default: // proceed
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-r.implicit:
|
|
||||||
metrics.ReconciliationRequested.WithLabelValues("implicit").Inc()
|
|
||||||
select {
|
|
||||||
case <-r.done:
|
|
||||||
return
|
|
||||||
case <-r.explicit:
|
|
||||||
break // give preference to a pending request for explicit
|
|
||||||
default: // continue
|
|
||||||
// don't run implicit reconciliation while explicit is ongoing
|
|
||||||
if finished != nil {
|
|
||||||
select {
|
|
||||||
case <-finished: // continue w/ implicit
|
|
||||||
default:
|
|
||||||
log.Infoln("skipping implicit reconcile because explicit reconcile is ongoing")
|
|
||||||
continue requestLoop
|
|
||||||
}
|
|
||||||
}
|
|
||||||
errOnce := proc.NewErrorOnce(r.done)
|
|
||||||
errCh := r.Do(func() {
|
|
||||||
var err error
|
|
||||||
defer errOnce.Report(err)
|
|
||||||
log.Infoln("implicit reconcile tasks")
|
|
||||||
metrics.ReconciliationExecuted.WithLabelValues("implicit").Inc()
|
|
||||||
if _, err = driver.ReconcileTasks([]*mesos.TaskStatus{}); err != nil {
|
|
||||||
log.V(1).Infof("failed to request implicit reconciliation from mesos: %v", err)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
proc.OnError(errOnce.Send(errCh).Err(), func(err error) {
|
|
||||||
log.Errorf("failed to run implicit reconciliation: %v", err)
|
|
||||||
}, r.done)
|
|
||||||
goto slowdown
|
|
||||||
}
|
|
||||||
case <-r.done:
|
|
||||||
return
|
|
||||||
case <-r.explicit: // continue
|
|
||||||
metrics.ReconciliationRequested.WithLabelValues("explicit").Inc()
|
|
||||||
}
|
|
||||||
|
|
||||||
if cancel != nil {
|
|
||||||
close(cancel)
|
|
||||||
cancel = nil
|
|
||||||
|
|
||||||
// play nice and wait for the prior operation to finish, complain
|
|
||||||
// if it doesn't
|
|
||||||
select {
|
|
||||||
case <-r.done:
|
|
||||||
return
|
|
||||||
case <-finished: // noop, expected
|
|
||||||
case <-time.After(r.explicitReconciliationAbortTimeout): // very unexpected
|
|
||||||
log.Error("reconciler action failed to stop upon cancellation")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// copy 'finished' to 'fin' here in case we end up with simultaneous go-routines,
|
|
||||||
// if cancellation takes too long or fails - we don't want to close the same chan
|
|
||||||
// more than once
|
|
||||||
cancel = make(chan struct{})
|
|
||||||
finished = make(chan struct{})
|
|
||||||
go func(fin chan struct{}) {
|
|
||||||
startedAt := time.Now()
|
|
||||||
defer func() {
|
|
||||||
metrics.ReconciliationLatency.Observe(metrics.InMicroseconds(time.Since(startedAt)))
|
|
||||||
}()
|
|
||||||
|
|
||||||
metrics.ReconciliationExecuted.WithLabelValues("explicit").Inc()
|
|
||||||
defer close(fin)
|
|
||||||
err := <-r.Action(driver, cancel)
|
|
||||||
if err == reconciliationCancelledErr {
|
|
||||||
metrics.ReconciliationCancelled.WithLabelValues("explicit").Inc()
|
|
||||||
log.Infoln(err.Error())
|
|
||||||
} else if err != nil {
|
|
||||||
log.Errorf("reconciler action failed: %v", err)
|
|
||||||
}
|
|
||||||
}(finished)
|
|
||||||
slowdown:
|
|
||||||
// don't allow reconciliation to run very frequently, either explicit or implicit
|
|
||||||
select {
|
|
||||||
case <-r.done:
|
|
||||||
return
|
|
||||||
case <-time.After(r.cooldown): // noop
|
|
||||||
}
|
|
||||||
} // for
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ks *KubernetesScheduler) recoverTasks() error {
|
|
||||||
podList, err := ks.client.Pods(api.NamespaceAll).List(labels.Everything(), fields.Everything())
|
|
||||||
if err != nil {
|
|
||||||
log.V(1).Infof("failed to recover pod registry, madness may ensue: %v", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
recoverSlave := func(t *podtask.T) {
|
|
||||||
|
|
||||||
slaveId := t.Spec.SlaveID
|
|
||||||
ks.slaveHostNames.Register(slaveId, t.Offer.Host())
|
|
||||||
}
|
|
||||||
for _, pod := range podList.Items {
|
|
||||||
if _, isMirrorPod := pod.Annotations[kubetypes.ConfigMirrorAnnotationKey]; isMirrorPod {
|
|
||||||
// mirrored pods are never reconciled because the scheduler isn't responsible for
|
|
||||||
// scheduling them; they're started by the executor/kubelet upon instantiation and
|
|
||||||
// reflected in the apiserver afterward. the scheduler has no knowledge of them.
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if t, ok, err := podtask.RecoverFrom(pod); err != nil {
|
|
||||||
log.Errorf("failed to recover task from pod, will attempt to delete '%v/%v': %v", pod.Namespace, pod.Name, err)
|
|
||||||
err := ks.client.Pods(pod.Namespace).Delete(pod.Name, nil)
|
|
||||||
//TODO(jdef) check for temporary or not-found errors
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("failed to delete pod '%v/%v': %v", pod.Namespace, pod.Name, err)
|
|
||||||
}
|
|
||||||
} else if ok {
|
|
||||||
ks.taskRegistry.Register(t)
|
|
||||||
recoverSlave(t)
|
|
||||||
log.Infof("recovered task %v from pod %v/%v", t.ID, pod.Namespace, pod.Name)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
74
contrib/mesos/pkg/scheduler/scheduler_mock.go
Normal file
74
contrib/mesos/pkg/scheduler/scheduler_mock.go
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package scheduler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/mock"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MockScheduler implements SchedulerApi
|
||||||
|
type MockScheduler struct {
|
||||||
|
sync.RWMutex
|
||||||
|
mock.Mock
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *MockScheduler) Run(done <-chan struct{}) {
|
||||||
|
_ = m.Called()
|
||||||
|
runtime.Until(func() {
|
||||||
|
time.Sleep(time.Second)
|
||||||
|
}, time.Second, done)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *MockScheduler) Offers() (f offers.Registry) {
|
||||||
|
args := m.Called()
|
||||||
|
x := args.Get(0)
|
||||||
|
if x != nil {
|
||||||
|
f = x.(offers.Registry)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *MockScheduler) Tasks() (f podtask.Registry) {
|
||||||
|
args := m.Called()
|
||||||
|
x := args.Get(0)
|
||||||
|
if x != nil {
|
||||||
|
f = x.(podtask.Registry)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *MockScheduler) KillTask(taskId string) error {
|
||||||
|
args := m.Called(taskId)
|
||||||
|
return args.Error(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *MockScheduler) LaunchTask(task *podtask.T) error {
|
||||||
|
args := m.Called(task)
|
||||||
|
return args.Error(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *MockScheduler) Reconcile(task *podtask.T) {
|
||||||
|
_ = m.Called()
|
||||||
|
return
|
||||||
|
}
|
@@ -42,7 +42,7 @@ func (m *SchedulerServer) newServiceWriter(stop <-chan struct{}) func() {
|
|||||||
glog.Errorf("Can't create scheduler service: %v", err)
|
glog.Errorf("Can't create scheduler service: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.Address), m.Port); err != nil {
|
if err := m.setEndpoints(SCHEDULER_SERVICE_NAME, net.IP(m.address), m.port); err != nil {
|
||||||
glog.Errorf("Can't create scheduler endpoints: %v", err)
|
glog.Errorf("Can't create scheduler endpoints: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -76,8 +76,8 @@ func (m *SchedulerServer) createSchedulerServiceIfNeeded(serviceName string, ser
|
|||||||
SessionAffinity: api.ServiceAffinityNone,
|
SessionAffinity: api.ServiceAffinityNone,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
if m.ServiceAddress != nil {
|
if m.serviceAddress != nil {
|
||||||
svc.Spec.ClusterIP = m.ServiceAddress.String()
|
svc.Spec.ClusterIP = m.serviceAddress.String()
|
||||||
}
|
}
|
||||||
_, err := m.client.Services(api.NamespaceValue(ctx)).Create(svc)
|
_, err := m.client.Services(api.NamespaceValue(ctx)).Create(svc)
|
||||||
if err != nil && errors.IsAlreadyExists(err) {
|
if err != nil && errors.IsAlreadyExists(err) {
|
||||||
|
@@ -54,7 +54,9 @@ import (
|
|||||||
minioncfg "k8s.io/kubernetes/contrib/mesos/pkg/minion/config"
|
minioncfg "k8s.io/kubernetes/contrib/mesos/pkg/minion/config"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/profile"
|
"k8s.io/kubernetes/contrib/mesos/pkg/profile"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
|
"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
|
||||||
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
|
||||||
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
|
schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
|
||||||
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
|
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
|
||||||
@@ -65,6 +67,7 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/api"
|
"k8s.io/kubernetes/pkg/api"
|
||||||
"k8s.io/kubernetes/pkg/api/resource"
|
"k8s.io/kubernetes/pkg/api/resource"
|
||||||
"k8s.io/kubernetes/pkg/client/cache"
|
"k8s.io/kubernetes/pkg/client/cache"
|
||||||
|
"k8s.io/kubernetes/pkg/client/record"
|
||||||
client "k8s.io/kubernetes/pkg/client/unversioned"
|
client "k8s.io/kubernetes/pkg/client/unversioned"
|
||||||
clientauth "k8s.io/kubernetes/pkg/client/unversioned/auth"
|
clientauth "k8s.io/kubernetes/pkg/client/unversioned/auth"
|
||||||
"k8s.io/kubernetes/pkg/fields"
|
"k8s.io/kubernetes/pkg/fields"
|
||||||
@@ -86,72 +89,72 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type SchedulerServer struct {
|
type SchedulerServer struct {
|
||||||
Port int
|
port int
|
||||||
Address net.IP
|
address net.IP
|
||||||
EnableProfiling bool
|
enableProfiling bool
|
||||||
AuthPath string
|
authPath string
|
||||||
APIServerList []string
|
apiServerList []string
|
||||||
EtcdServerList []string
|
etcdServerList []string
|
||||||
EtcdConfigFile string
|
etcdConfigFile string
|
||||||
AllowPrivileged bool
|
allowPrivileged bool
|
||||||
ExecutorPath string
|
executorPath string
|
||||||
ProxyPath string
|
proxyPath string
|
||||||
MesosMaster string
|
mesosMaster string
|
||||||
MesosUser string
|
mesosUser string
|
||||||
MesosRole string
|
mesosRole string
|
||||||
MesosAuthPrincipal string
|
mesosAuthPrincipal string
|
||||||
MesosAuthSecretFile string
|
mesosAuthSecretFile string
|
||||||
MesosCgroupPrefix string
|
mesosCgroupPrefix string
|
||||||
MesosExecutorCPUs mresource.CPUShares
|
mesosExecutorCPUs mresource.CPUShares
|
||||||
MesosExecutorMem mresource.MegaBytes
|
mesosExecutorMem mresource.MegaBytes
|
||||||
Checkpoint bool
|
checkpoint bool
|
||||||
FailoverTimeout float64
|
failoverTimeout float64
|
||||||
|
|
||||||
ExecutorLogV int
|
executorLogV int
|
||||||
ExecutorBindall bool
|
executorBindall bool
|
||||||
ExecutorSuicideTimeout time.Duration
|
executorSuicideTimeout time.Duration
|
||||||
LaunchGracePeriod time.Duration
|
launchGracePeriod time.Duration
|
||||||
|
|
||||||
RunProxy bool
|
runProxy bool
|
||||||
ProxyBindall bool
|
proxyBindall bool
|
||||||
ProxyLogV int
|
proxyLogV int
|
||||||
|
|
||||||
MinionPathOverride string
|
minionPathOverride string
|
||||||
MinionLogMaxSize resource.Quantity
|
minionLogMaxSize resource.Quantity
|
||||||
MinionLogMaxBackups int
|
minionLogMaxBackups int
|
||||||
MinionLogMaxAgeInDays int
|
minionLogMaxAgeInDays int
|
||||||
|
|
||||||
MesosAuthProvider string
|
mesosAuthProvider string
|
||||||
DriverPort uint
|
driverPort uint
|
||||||
HostnameOverride string
|
hostnameOverride string
|
||||||
ReconcileInterval int64
|
reconcileInterval int64
|
||||||
ReconcileCooldown time.Duration
|
reconcileCooldown time.Duration
|
||||||
DefaultContainerCPULimit mresource.CPUShares
|
defaultContainerCPULimit mresource.CPUShares
|
||||||
DefaultContainerMemLimit mresource.MegaBytes
|
defaultContainerMemLimit mresource.MegaBytes
|
||||||
SchedulerConfigFileName string
|
schedulerConfigFileName string
|
||||||
Graceful bool
|
graceful bool
|
||||||
FrameworkName string
|
frameworkName string
|
||||||
FrameworkWebURI string
|
frameworkWebURI string
|
||||||
HA bool
|
ha bool
|
||||||
AdvertisedAddress string
|
advertisedAddress string
|
||||||
ServiceAddress net.IP
|
serviceAddress net.IP
|
||||||
HADomain string
|
haDomain string
|
||||||
KMPath string
|
kmPath string
|
||||||
ClusterDNS net.IP
|
clusterDNS net.IP
|
||||||
ClusterDomain string
|
clusterDomain string
|
||||||
KubeletRootDirectory string
|
kubeletRootDirectory string
|
||||||
KubeletDockerEndpoint string
|
kubeletDockerEndpoint string
|
||||||
KubeletPodInfraContainerImage string
|
kubeletPodInfraContainerImage string
|
||||||
KubeletCadvisorPort uint
|
kubeletCadvisorPort uint
|
||||||
KubeletHostNetworkSources string
|
kubeletHostNetworkSources string
|
||||||
KubeletSyncFrequency time.Duration
|
kubeletSyncFrequency time.Duration
|
||||||
KubeletNetworkPluginName string
|
kubeletNetworkPluginName string
|
||||||
StaticPodsConfigPath string
|
staticPodsConfigPath string
|
||||||
DockerCfgPath string
|
dockerCfgPath string
|
||||||
ContainPodResources bool
|
containPodResources bool
|
||||||
AccountForPodResources bool
|
accountForPodResources bool
|
||||||
nodeRelistPeriod time.Duration
|
nodeRelistPeriod time.Duration
|
||||||
SandboxOverlay string
|
sandboxOverlay string
|
||||||
|
|
||||||
executable string // path to the binary running this service
|
executable string // path to the binary running this service
|
||||||
client *client.Client
|
client *client.Client
|
||||||
@@ -170,36 +173,36 @@ type schedulerProcessInterface interface {
|
|||||||
// NewSchedulerServer creates a new SchedulerServer with default parameters
|
// NewSchedulerServer creates a new SchedulerServer with default parameters
|
||||||
func NewSchedulerServer() *SchedulerServer {
|
func NewSchedulerServer() *SchedulerServer {
|
||||||
s := SchedulerServer{
|
s := SchedulerServer{
|
||||||
Port: ports.SchedulerPort,
|
port: ports.SchedulerPort,
|
||||||
Address: net.ParseIP("127.0.0.1"),
|
address: net.ParseIP("127.0.0.1"),
|
||||||
FailoverTimeout: time.Duration((1 << 62) - 1).Seconds(),
|
failoverTimeout: time.Duration((1 << 62) - 1).Seconds(),
|
||||||
|
|
||||||
RunProxy: true,
|
runProxy: true,
|
||||||
ExecutorSuicideTimeout: execcfg.DefaultSuicideTimeout,
|
executorSuicideTimeout: execcfg.DefaultSuicideTimeout,
|
||||||
LaunchGracePeriod: execcfg.DefaultLaunchGracePeriod,
|
launchGracePeriod: execcfg.DefaultLaunchGracePeriod,
|
||||||
DefaultContainerCPULimit: mresource.DefaultDefaultContainerCPULimit,
|
defaultContainerCPULimit: mresource.DefaultDefaultContainerCPULimit,
|
||||||
DefaultContainerMemLimit: mresource.DefaultDefaultContainerMemLimit,
|
defaultContainerMemLimit: mresource.DefaultDefaultContainerMemLimit,
|
||||||
|
|
||||||
MinionLogMaxSize: minioncfg.DefaultLogMaxSize(),
|
minionLogMaxSize: minioncfg.DefaultLogMaxSize(),
|
||||||
MinionLogMaxBackups: minioncfg.DefaultLogMaxBackups,
|
minionLogMaxBackups: minioncfg.DefaultLogMaxBackups,
|
||||||
MinionLogMaxAgeInDays: minioncfg.DefaultLogMaxAgeInDays,
|
minionLogMaxAgeInDays: minioncfg.DefaultLogMaxAgeInDays,
|
||||||
|
|
||||||
MesosAuthProvider: sasl.ProviderName,
|
mesosAuthProvider: sasl.ProviderName,
|
||||||
MesosCgroupPrefix: minioncfg.DefaultCgroupPrefix,
|
mesosCgroupPrefix: minioncfg.DefaultCgroupPrefix,
|
||||||
MesosMaster: defaultMesosMaster,
|
mesosMaster: defaultMesosMaster,
|
||||||
MesosUser: defaultMesosUser,
|
mesosUser: defaultMesosUser,
|
||||||
MesosExecutorCPUs: defaultExecutorCPUs,
|
mesosExecutorCPUs: defaultExecutorCPUs,
|
||||||
MesosExecutorMem: defaultExecutorMem,
|
mesosExecutorMem: defaultExecutorMem,
|
||||||
ReconcileInterval: defaultReconcileInterval,
|
reconcileInterval: defaultReconcileInterval,
|
||||||
ReconcileCooldown: defaultReconcileCooldown,
|
reconcileCooldown: defaultReconcileCooldown,
|
||||||
Checkpoint: true,
|
checkpoint: true,
|
||||||
FrameworkName: defaultFrameworkName,
|
frameworkName: defaultFrameworkName,
|
||||||
HA: false,
|
ha: false,
|
||||||
mux: http.NewServeMux(),
|
mux: http.NewServeMux(),
|
||||||
KubeletCadvisorPort: 4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
|
kubeletCadvisorPort: 4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
|
||||||
KubeletSyncFrequency: 10 * time.Second,
|
kubeletSyncFrequency: 10 * time.Second,
|
||||||
ContainPodResources: true,
|
containPodResources: true,
|
||||||
AccountForPodResources: true,
|
accountForPodResources: true,
|
||||||
nodeRelistPeriod: defaultNodeRelistPeriod,
|
nodeRelistPeriod: defaultNodeRelistPeriod,
|
||||||
}
|
}
|
||||||
// cache this for later use. also useful in case the original binary gets deleted, e.g.
|
// cache this for later use. also useful in case the original binary gets deleted, e.g.
|
||||||
@@ -208,76 +211,76 @@ func NewSchedulerServer() *SchedulerServer {
|
|||||||
log.Fatalf("failed to determine path to currently running executable: %v", err)
|
log.Fatalf("failed to determine path to currently running executable: %v", err)
|
||||||
} else {
|
} else {
|
||||||
s.executable = filename
|
s.executable = filename
|
||||||
s.KMPath = filename
|
s.kmPath = filename
|
||||||
}
|
}
|
||||||
|
|
||||||
return &s
|
return &s
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
|
func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
|
||||||
fs.IntVar(&s.Port, "port", s.Port, "The port that the scheduler's http service runs on")
|
fs.IntVar(&s.port, "port", s.port, "The port that the scheduler's http service runs on")
|
||||||
fs.IPVar(&s.Address, "address", s.Address, "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
|
fs.IPVar(&s.address, "address", s.address, "The IP address to serve on (set to 0.0.0.0 for all interfaces)")
|
||||||
fs.BoolVar(&s.EnableProfiling, "profiling", s.EnableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
|
fs.BoolVar(&s.enableProfiling, "profiling", s.enableProfiling, "Enable profiling via web interface host:port/debug/pprof/")
|
||||||
fs.StringSliceVar(&s.APIServerList, "api-servers", s.APIServerList, "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
|
fs.StringSliceVar(&s.apiServerList, "api-servers", s.apiServerList, "List of Kubernetes API servers for publishing events, and reading pods and services. (ip:port), comma separated.")
|
||||||
fs.StringVar(&s.AuthPath, "auth-path", s.AuthPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
|
fs.StringVar(&s.authPath, "auth-path", s.authPath, "Path to .kubernetes_auth file, specifying how to authenticate to API server.")
|
||||||
fs.StringSliceVar(&s.EtcdServerList, "etcd-servers", s.EtcdServerList, "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
|
fs.StringSliceVar(&s.etcdServerList, "etcd-servers", s.etcdServerList, "List of etcd servers to watch (http://ip:port), comma separated. Mutually exclusive with --etcd-config")
|
||||||
fs.StringVar(&s.EtcdConfigFile, "etcd-config", s.EtcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
|
fs.StringVar(&s.etcdConfigFile, "etcd-config", s.etcdConfigFile, "The config file for the etcd client. Mutually exclusive with --etcd-servers.")
|
||||||
fs.BoolVar(&s.AllowPrivileged, "allow-privileged", s.AllowPrivileged, "If true, allow privileged containers.")
|
fs.BoolVar(&s.allowPrivileged, "allow-privileged", s.allowPrivileged, "If true, allow privileged containers.")
|
||||||
fs.StringVar(&s.ClusterDomain, "cluster-domain", s.ClusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
|
fs.StringVar(&s.clusterDomain, "cluster-domain", s.clusterDomain, "Domain for this cluster. If set, kubelet will configure all containers to search this domain in addition to the host's search domains")
|
||||||
fs.IPVar(&s.ClusterDNS, "cluster-dns", s.ClusterDNS, "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")
|
fs.IPVar(&s.clusterDNS, "cluster-dns", s.clusterDNS, "IP address for a cluster DNS server. If set, kubelet will configure all containers to use this for DNS resolution in addition to the host's DNS servers")
|
||||||
fs.StringVar(&s.StaticPodsConfigPath, "static-pods-config", s.StaticPodsConfigPath, "Path for specification of static pods. Path should point to dir containing the staticPods configuration files. Defaults to none.")
|
fs.StringVar(&s.staticPodsConfigPath, "static-pods-config", s.staticPodsConfigPath, "Path for specification of static pods. Path should point to dir containing the staticPods configuration files. Defaults to none.")
|
||||||
|
|
||||||
fs.StringVar(&s.MesosMaster, "mesos-master", s.MesosMaster, "Location of the Mesos master. The format is a comma-delimited list of of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
|
fs.StringVar(&s.mesosMaster, "mesos-master", s.mesosMaster, "Location of the Mesos master. The format is a comma-delimited list of of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
|
||||||
fs.StringVar(&s.MesosUser, "mesos-user", s.MesosUser, "Mesos user for this framework, defaults to root.")
|
fs.StringVar(&s.mesosUser, "mesos-user", s.mesosUser, "Mesos user for this framework, defaults to root.")
|
||||||
fs.StringVar(&s.MesosRole, "mesos-role", s.MesosRole, "Mesos role for this framework, defaults to none.")
|
fs.StringVar(&s.mesosRole, "mesos-role", s.mesosRole, "Mesos role for this framework, defaults to none.")
|
||||||
fs.StringVar(&s.MesosAuthPrincipal, "mesos-authentication-principal", s.MesosAuthPrincipal, "Mesos authentication principal.")
|
fs.StringVar(&s.mesosAuthPrincipal, "mesos-authentication-principal", s.mesosAuthPrincipal, "Mesos authentication principal.")
|
||||||
fs.StringVar(&s.MesosAuthSecretFile, "mesos-authentication-secret-file", s.MesosAuthSecretFile, "Mesos authentication secret file.")
|
fs.StringVar(&s.mesosAuthSecretFile, "mesos-authentication-secret-file", s.mesosAuthSecretFile, "Mesos authentication secret file.")
|
||||||
fs.StringVar(&s.MesosAuthProvider, "mesos-authentication-provider", s.MesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
|
fs.StringVar(&s.mesosAuthProvider, "mesos-authentication-provider", s.mesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
|
||||||
fs.StringVar(&s.DockerCfgPath, "dockercfg-path", s.DockerCfgPath, "Path to a dockercfg file that will be used by the docker instance of the minions.")
|
fs.StringVar(&s.dockerCfgPath, "dockercfg-path", s.dockerCfgPath, "Path to a dockercfg file that will be used by the docker instance of the minions.")
|
||||||
fs.StringVar(&s.MesosCgroupPrefix, "mesos-cgroup-prefix", s.MesosCgroupPrefix, "The cgroup prefix concatenated with MESOS_DIRECTORY must give the executor cgroup set by Mesos")
|
fs.StringVar(&s.mesosCgroupPrefix, "mesos-cgroup-prefix", s.mesosCgroupPrefix, "The cgroup prefix concatenated with MESOS_DIRECTORY must give the executor cgroup set by Mesos")
|
||||||
fs.Var(&s.MesosExecutorCPUs, "mesos-executor-cpus", "Initial CPU shares to allocate for each Mesos executor container.")
|
fs.Var(&s.mesosExecutorCPUs, "mesos-executor-cpus", "Initial CPU shares to allocate for each Mesos executor container.")
|
||||||
fs.Var(&s.MesosExecutorMem, "mesos-executor-mem", "Initial memory (MB) to allocate for each Mesos executor container.")
|
fs.Var(&s.mesosExecutorMem, "mesos-executor-mem", "Initial memory (MB) to allocate for each Mesos executor container.")
|
||||||
fs.BoolVar(&s.Checkpoint, "checkpoint", s.Checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
|
fs.BoolVar(&s.checkpoint, "checkpoint", s.checkpoint, "Enable/disable checkpointing for the kubernetes-mesos framework.")
|
||||||
fs.Float64Var(&s.FailoverTimeout, "failover-timeout", s.FailoverTimeout, fmt.Sprintf("Framework failover timeout, in sec."))
|
fs.Float64Var(&s.failoverTimeout, "failover-timeout", s.failoverTimeout, fmt.Sprintf("Framework failover timeout, in sec."))
|
||||||
fs.UintVar(&s.DriverPort, "driver-port", s.DriverPort, "Port that the Mesos scheduler driver process should listen on.")
|
fs.UintVar(&s.driverPort, "driver-port", s.driverPort, "Port that the Mesos scheduler driver process should listen on.")
|
||||||
fs.StringVar(&s.HostnameOverride, "hostname-override", s.HostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
|
fs.StringVar(&s.hostnameOverride, "hostname-override", s.hostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
|
||||||
fs.Int64Var(&s.ReconcileInterval, "reconcile-interval", s.ReconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
|
fs.Int64Var(&s.reconcileInterval, "reconcile-interval", s.reconcileInterval, "Interval at which to execute task reconciliation, in sec. Zero disables.")
|
||||||
fs.DurationVar(&s.ReconcileCooldown, "reconcile-cooldown", s.ReconcileCooldown, "Minimum rest period between task reconciliation operations.")
|
fs.DurationVar(&s.reconcileCooldown, "reconcile-cooldown", s.reconcileCooldown, "Minimum rest period between task reconciliation operations.")
|
||||||
fs.StringVar(&s.SchedulerConfigFileName, "scheduler-config", s.SchedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
|
fs.StringVar(&s.schedulerConfigFileName, "scheduler-config", s.schedulerConfigFileName, "An ini-style configuration file with low-level scheduler settings.")
|
||||||
fs.BoolVar(&s.Graceful, "graceful", s.Graceful, "Indicator of a graceful failover, intended for internal use only.")
|
fs.BoolVar(&s.graceful, "graceful", s.graceful, "Indicator of a graceful failover, intended for internal use only.")
|
||||||
fs.BoolVar(&s.HA, "ha", s.HA, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
|
fs.BoolVar(&s.ha, "ha", s.ha, "Run the scheduler in high availability mode with leader election. All peers should be configured exactly the same.")
|
||||||
fs.StringVar(&s.FrameworkName, "framework-name", s.FrameworkName, "The framework name to register with Mesos.")
|
fs.StringVar(&s.frameworkName, "framework-name", s.frameworkName, "The framework name to register with Mesos.")
|
||||||
fs.StringVar(&s.FrameworkWebURI, "framework-weburi", s.FrameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
|
fs.StringVar(&s.frameworkWebURI, "framework-weburi", s.frameworkWebURI, "A URI that points to a web-based interface for interacting with the framework.")
|
||||||
fs.StringVar(&s.AdvertisedAddress, "advertised-address", s.AdvertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
|
fs.StringVar(&s.advertisedAddress, "advertised-address", s.advertisedAddress, "host:port address that is advertised to clients. May be used to construct artifact download URIs.")
|
||||||
fs.IPVar(&s.ServiceAddress, "service-address", s.ServiceAddress, "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
|
fs.IPVar(&s.serviceAddress, "service-address", s.serviceAddress, "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
|
||||||
fs.Var(&s.DefaultContainerCPULimit, "default-container-cpu-limit", "Containers without a CPU resource limit are admitted this much CPU shares")
|
fs.Var(&s.defaultContainerCPULimit, "default-container-cpu-limit", "Containers without a CPU resource limit are admitted this much CPU shares")
|
||||||
fs.Var(&s.DefaultContainerMemLimit, "default-container-mem-limit", "Containers without a memory resource limit are admitted this much amount of memory in MB")
|
fs.Var(&s.defaultContainerMemLimit, "default-container-mem-limit", "Containers without a memory resource limit are admitted this much amount of memory in MB")
|
||||||
fs.BoolVar(&s.ContainPodResources, "contain-pod-resources", s.ContainPodResources, "Reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
|
fs.BoolVar(&s.containPodResources, "contain-pod-resources", s.containPodResources, "Reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
|
||||||
fs.BoolVar(&s.AccountForPodResources, "account-for-pod-resources", s.AccountForPodResources, "Allocate pod CPU and memory resources from offers (Default: true)")
|
fs.BoolVar(&s.accountForPodResources, "account-for-pod-resources", s.accountForPodResources, "Allocate pod CPU and memory resources from offers (Default: true)")
|
||||||
fs.DurationVar(&s.nodeRelistPeriod, "node-monitor-period", s.nodeRelistPeriod, "Period between relisting of all nodes from the apiserver.")
|
fs.DurationVar(&s.nodeRelistPeriod, "node-monitor-period", s.nodeRelistPeriod, "Period between relisting of all nodes from the apiserver.")
|
||||||
|
|
||||||
fs.IntVar(&s.ExecutorLogV, "executor-logv", s.ExecutorLogV, "Logging verbosity of spawned minion and executor processes.")
|
fs.IntVar(&s.executorLogV, "executor-logv", s.executorLogV, "Logging verbosity of spawned minion and executor processes.")
|
||||||
fs.BoolVar(&s.ExecutorBindall, "executor-bindall", s.ExecutorBindall, "When true will set -address of the executor to 0.0.0.0.")
|
fs.BoolVar(&s.executorBindall, "executor-bindall", s.executorBindall, "When true will set -address of the executor to 0.0.0.0.")
|
||||||
fs.DurationVar(&s.ExecutorSuicideTimeout, "executor-suicide-timeout", s.ExecutorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")
|
fs.DurationVar(&s.executorSuicideTimeout, "executor-suicide-timeout", s.executorSuicideTimeout, "Executor self-terminates after this period of inactivity. Zero disables suicide watch.")
|
||||||
fs.DurationVar(&s.LaunchGracePeriod, "mesos-launch-grace-period", s.LaunchGracePeriod, "Launch grace period after which launching tasks will be cancelled. Zero disables launch cancellation.")
|
fs.DurationVar(&s.launchGracePeriod, "mesos-launch-grace-period", s.launchGracePeriod, "Launch grace period after which launching tasks will be cancelled. Zero disables launch cancellation.")
|
||||||
fs.StringVar(&s.SandboxOverlay, "mesos-sandbox-overlay", s.SandboxOverlay, "Path to an archive (tar.gz, tar.bz2 or zip) extracted into the sandbox.")
|
fs.StringVar(&s.sandboxOverlay, "mesos-sandbox-overlay", s.sandboxOverlay, "Path to an archive (tar.gz, tar.bz2 or zip) extracted into the sandbox.")
|
||||||
|
|
||||||
fs.BoolVar(&s.ProxyBindall, "proxy-bindall", s.ProxyBindall, "When true pass -proxy-bindall to the executor.")
|
fs.BoolVar(&s.proxyBindall, "proxy-bindall", s.proxyBindall, "When true pass -proxy-bindall to the executor.")
|
||||||
fs.BoolVar(&s.RunProxy, "run-proxy", s.RunProxy, "Run the kube-proxy as a side process of the executor.")
|
fs.BoolVar(&s.runProxy, "run-proxy", s.runProxy, "Run the kube-proxy as a side process of the executor.")
|
||||||
fs.IntVar(&s.ProxyLogV, "proxy-logv", s.ProxyLogV, "Logging verbosity of spawned minion proxy processes.")
|
fs.IntVar(&s.proxyLogV, "proxy-logv", s.proxyLogV, "Logging verbosity of spawned minion proxy processes.")
|
||||||
|
|
||||||
fs.StringVar(&s.MinionPathOverride, "minion-path-override", s.MinionPathOverride, "Override the PATH in the environment of the minion sub-processes.")
|
fs.StringVar(&s.minionPathOverride, "minion-path-override", s.minionPathOverride, "Override the PATH in the environment of the minion sub-processes.")
|
||||||
fs.Var(resource.NewQuantityFlagValue(&s.MinionLogMaxSize), "minion-max-log-size", "Maximum log file size for the executor and proxy before rotation")
|
fs.Var(resource.NewQuantityFlagValue(&s.minionLogMaxSize), "minion-max-log-size", "Maximum log file size for the executor and proxy before rotation")
|
||||||
fs.IntVar(&s.MinionLogMaxAgeInDays, "minion-max-log-age", s.MinionLogMaxAgeInDays, "Maximum log file age of the executor and proxy in days")
|
fs.IntVar(&s.minionLogMaxAgeInDays, "minion-max-log-age", s.minionLogMaxAgeInDays, "Maximum log file age of the executor and proxy in days")
|
||||||
fs.IntVar(&s.MinionLogMaxBackups, "minion-max-log-backups", s.MinionLogMaxBackups, "Maximum log file backups of the executor and proxy to keep after rotation")
|
fs.IntVar(&s.minionLogMaxBackups, "minion-max-log-backups", s.minionLogMaxBackups, "Maximum log file backups of the executor and proxy to keep after rotation")
|
||||||
|
|
||||||
fs.StringVar(&s.KubeletRootDirectory, "kubelet-root-dir", s.KubeletRootDirectory, "Directory path for managing kubelet files (volume mounts,etc). Defaults to executor sandbox.")
|
fs.StringVar(&s.kubeletRootDirectory, "kubelet-root-dir", s.kubeletRootDirectory, "Directory path for managing kubelet files (volume mounts,etc). Defaults to executor sandbox.")
|
||||||
fs.StringVar(&s.KubeletDockerEndpoint, "kubelet-docker-endpoint", s.KubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
|
fs.StringVar(&s.kubeletDockerEndpoint, "kubelet-docker-endpoint", s.kubeletDockerEndpoint, "If non-empty, kubelet will use this for the docker endpoint to communicate with.")
|
||||||
fs.StringVar(&s.KubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.KubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
|
fs.StringVar(&s.kubeletPodInfraContainerImage, "kubelet-pod-infra-container-image", s.kubeletPodInfraContainerImage, "The image whose network/ipc namespaces containers in each pod will use.")
|
||||||
fs.UintVar(&s.KubeletCadvisorPort, "kubelet-cadvisor-port", s.KubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
|
fs.UintVar(&s.kubeletCadvisorPort, "kubelet-cadvisor-port", s.kubeletCadvisorPort, "The port of the kubelet's local cAdvisor endpoint")
|
||||||
fs.StringVar(&s.KubeletHostNetworkSources, "kubelet-host-network-sources", s.KubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use of host network. For all sources use \"*\" [default=\"file\"]")
|
fs.StringVar(&s.kubeletHostNetworkSources, "kubelet-host-network-sources", s.kubeletHostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use of host network. For all sources use \"*\" [default=\"file\"]")
|
||||||
fs.DurationVar(&s.KubeletSyncFrequency, "kubelet-sync-frequency", s.KubeletSyncFrequency, "Max period between synchronizing running containers and config")
|
fs.DurationVar(&s.kubeletSyncFrequency, "kubelet-sync-frequency", s.kubeletSyncFrequency, "Max period between synchronizing running containers and config")
|
||||||
fs.StringVar(&s.KubeletNetworkPluginName, "kubelet-network-plugin", s.KubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
|
fs.StringVar(&s.kubeletNetworkPluginName, "kubelet-network-plugin", s.kubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
|
||||||
|
|
||||||
//TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
|
//TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
|
||||||
//fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
|
//fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
|
||||||
@@ -285,12 +288,12 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
|
|||||||
|
|
||||||
func (s *SchedulerServer) AddStandaloneFlags(fs *pflag.FlagSet) {
|
func (s *SchedulerServer) AddStandaloneFlags(fs *pflag.FlagSet) {
|
||||||
s.addCoreFlags(fs)
|
s.addCoreFlags(fs)
|
||||||
fs.StringVar(&s.ExecutorPath, "executor-path", s.ExecutorPath, "Location of the kubernetes executor executable")
|
fs.StringVar(&s.executorPath, "executor-path", s.executorPath, "Location of the kubernetes executor executable")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SchedulerServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
|
func (s *SchedulerServer) AddHyperkubeFlags(fs *pflag.FlagSet) {
|
||||||
s.addCoreFlags(fs)
|
s.addCoreFlags(fs)
|
||||||
fs.StringVar(&s.KMPath, "km-path", s.KMPath, "Location of the km executable, may be a URI or an absolute file path.")
|
fs.StringVar(&s.kmPath, "km-path", s.kmPath, "Location of the km executable, may be a URI or an absolute file path.")
|
||||||
}
|
}
|
||||||
|
|
||||||
// returns (downloadURI, basename(path))
|
// returns (downloadURI, basename(path))
|
||||||
@@ -310,12 +313,12 @@ func (s *SchedulerServer) serveFrameworkArtifactWithFilename(path string, filena
|
|||||||
serveFile("/"+filename, path)
|
serveFile("/"+filename, path)
|
||||||
|
|
||||||
hostURI := ""
|
hostURI := ""
|
||||||
if s.AdvertisedAddress != "" {
|
if s.advertisedAddress != "" {
|
||||||
hostURI = fmt.Sprintf("http://%s/%s", s.AdvertisedAddress, filename)
|
hostURI = fmt.Sprintf("http://%s/%s", s.advertisedAddress, filename)
|
||||||
} else if s.HA && s.HADomain != "" {
|
} else if s.ha && s.haDomain != "" {
|
||||||
hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.HADomain, ports.SchedulerPort, filename)
|
hostURI = fmt.Sprintf("http://%s.%s:%d/%s", SCHEDULER_SERVICE_NAME, s.haDomain, ports.SchedulerPort, filename)
|
||||||
} else {
|
} else {
|
||||||
hostURI = fmt.Sprintf("http://%s:%d/%s", s.Address.String(), s.Port, filename)
|
hostURI = fmt.Sprintf("http://%s:%d/%s", s.address.String(), s.port, filename)
|
||||||
}
|
}
|
||||||
log.V(2).Infof("Hosting artifact '%s' at '%s'", filename, hostURI)
|
log.V(2).Infof("Hosting artifact '%s' at '%s'", filename, hostURI)
|
||||||
|
|
||||||
@@ -327,21 +330,21 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
|
|||||||
Shell: proto.Bool(false),
|
Shell: proto.Bool(false),
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.ExecutorPath != "" {
|
if s.executorPath != "" {
|
||||||
uri, executorCmd := s.serveFrameworkArtifact(s.ExecutorPath)
|
uri, executorCmd := s.serveFrameworkArtifact(s.executorPath)
|
||||||
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
|
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
|
||||||
ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
|
ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
|
||||||
} else if !hks.FindServer(hyperkube.CommandMinion) {
|
} else if !hks.FindServer(hyperkube.CommandMinion) {
|
||||||
return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
|
return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
|
||||||
} else {
|
} else {
|
||||||
if strings.Index(s.KMPath, "://") > 0 {
|
if strings.Index(s.kmPath, "://") > 0 {
|
||||||
// URI could point directly to executable, e.g. hdfs:///km
|
// URI could point directly to executable, e.g. hdfs:///km
|
||||||
// or else indirectly, e.g. http://acmestorage/tarball.tgz
|
// or else indirectly, e.g. http://acmestorage/tarball.tgz
|
||||||
// so we assume that for this case the command will always "km"
|
// so we assume that for this case the command will always "km"
|
||||||
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.KMPath), Executable: proto.Bool(true)})
|
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.kmPath), Executable: proto.Bool(true)})
|
||||||
ci.Value = proto.String("./km") // TODO(jdef) extract constant
|
ci.Value = proto.String("./km") // TODO(jdef) extract constant
|
||||||
} else if s.KMPath != "" {
|
} else if s.kmPath != "" {
|
||||||
uri, kmCmd := s.serveFrameworkArtifact(s.KMPath)
|
uri, kmCmd := s.serveFrameworkArtifact(s.kmPath)
|
||||||
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
|
ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
|
||||||
ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
|
ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
|
||||||
} else {
|
} else {
|
||||||
@@ -351,55 +354,55 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
|
|||||||
}
|
}
|
||||||
ci.Arguments = append(ci.Arguments, hyperkube.CommandMinion)
|
ci.Arguments = append(ci.Arguments, hyperkube.CommandMinion)
|
||||||
|
|
||||||
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.RunProxy))
|
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.runProxy))
|
||||||
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.ProxyBindall))
|
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.proxyBindall))
|
||||||
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-logv=%d", s.ProxyLogV))
|
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-logv=%d", s.proxyLogV))
|
||||||
|
|
||||||
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--path-override=%s", s.MinionPathOverride))
|
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--path-override=%s", s.minionPathOverride))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-size=%v", s.MinionLogMaxSize.String()))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-size=%v", s.minionLogMaxSize.String()))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-backups=%d", s.MinionLogMaxBackups))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-backups=%d", s.minionLogMaxBackups))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-age=%d", s.MinionLogMaxAgeInDays))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-age=%d", s.minionLogMaxAgeInDays))
 }

-if s.SandboxOverlay != "" {
+if s.sandboxOverlay != "" {
-if _, err := os.Stat(s.SandboxOverlay); os.IsNotExist(err) {
+if _, err := os.Stat(s.sandboxOverlay); os.IsNotExist(err) {
-log.Fatalf("Sandbox overlay archive not found: %s", s.SandboxOverlay)
+return nil, nil, fmt.Errorf("Sandbox overlay archive not found: %s", s.sandboxOverlay)
 }
-uri, _ := s.serveFrameworkArtifact(s.SandboxOverlay)
+uri, _ := s.serveFrameworkArtifact(s.sandboxOverlay)
 ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(false), Extract: proto.Bool(true)})
 }

-if s.DockerCfgPath != "" {
+if s.dockerCfgPath != "" {
-uri := s.serveFrameworkArtifactWithFilename(s.DockerCfgPath, ".dockercfg")
+uri := s.serveFrameworkArtifactWithFilename(s.dockerCfgPath, ".dockercfg")
 ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(false), Extract: proto.Bool(false)})
 }

 //TODO(jdef): provide some way (env var?) for users to customize executor config
 //TODO(jdef): set -address to 127.0.0.1 if `address` is 127.0.0.1

-apiServerArgs := strings.Join(s.APIServerList, ",")
+apiServerArgs := strings.Join(s.apiServerList, ",")
 ci.Arguments = append(ci.Arguments, fmt.Sprintf("--api-servers=%s", apiServerArgs))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.ExecutorLogV)) // this also applies to the minion
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.executorLogV)) // this also applies to the minion
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.AllowPrivileged))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.allowPrivileged))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.ExecutorSuicideTimeout))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.executorSuicideTimeout))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-launch-grace-period=%v", s.LaunchGracePeriod))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-launch-grace-period=%v", s.launchGracePeriod))

-if s.ExecutorBindall {
+if s.executorBindall {
 //TODO(jdef) determine whether hostname-override is really needed for bindall because
 //it conflicts with kubelet node status checks/updates
 //ci.Arguments = append(ci.Arguments, "--hostname-override=0.0.0.0")
 ci.Arguments = append(ci.Arguments, "--address=0.0.0.0")
 }

-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-cgroup-prefix=%v", s.MesosCgroupPrefix))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-cgroup-prefix=%v", s.mesosCgroupPrefix))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.KubeletCadvisorPort))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.kubeletCadvisorPort))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.KubeletSyncFrequency))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.kubeletSyncFrequency))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.ContainPodResources))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.containPodResources))
-ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.EnableProfiling))
+ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.enableProfiling))

-if s.AuthPath != "" {
+if s.authPath != "" {
 //TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file
-uri, basename := s.serveFrameworkArtifact(s.AuthPath)
+uri, basename := s.serveFrameworkArtifact(s.authPath)
 ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri)})
 ci.Arguments = append(ci.Arguments, fmt.Sprintf("--auth-path=%s", basename))
 }
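Note: the bulk of the renames above and below move SchedulerServer's flag-backed fields from exported to unexported names, so other packages can no longer poke at them directly. As a rough illustration of that pattern only (none of these names come from this PR), here is a self-contained Go sketch of unexported config fields bound to flags through a constructor and an addFlags method:

package main

import (
	"flag"
	"fmt"
)

// serverConfig mirrors the style this refactor moves toward: fields are
// unexported, so only this package can mutate them directly.
type serverConfig struct {
	frameworkName   string
	failoverTimeout float64
}

// newServerConfig returns defaults; callers customize via flags, not fields.
func newServerConfig() *serverConfig {
	return &serverConfig{
		frameworkName:   "Kubernetes",
		failoverTimeout: 7 * 24 * 60 * 60, // one week, in seconds
	}
}

// addFlags is the only sanctioned way for other code to influence the config.
func (c *serverConfig) addFlags(fs *flag.FlagSet) {
	fs.StringVar(&c.frameworkName, "framework-name", c.frameworkName, "framework name to register with Mesos")
	fs.Float64Var(&c.failoverTimeout, "failover-timeout", c.failoverTimeout, "framework failover timeout in seconds")
}

func main() {
	cfg := newServerConfig()
	fs := flag.NewFlagSet("scheduler", flag.ExitOnError)
	cfg.addFlags(fs)
	_ = fs.Parse([]string{"--framework-name=demo"})
	fmt.Printf("framework=%q failover=%vs\n", cfg.frameworkName, cfg.failoverTimeout)
}
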
@@ -408,15 +411,15 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
 ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value))
 }
 }
-if s.ClusterDNS != nil {
+if s.clusterDNS != nil {
-appendOptional("cluster-dns", s.ClusterDNS.String())
+appendOptional("cluster-dns", s.clusterDNS.String())
 }
-appendOptional("cluster-domain", s.ClusterDomain)
+appendOptional("cluster-domain", s.clusterDomain)
-appendOptional("root-dir", s.KubeletRootDirectory)
+appendOptional("root-dir", s.kubeletRootDirectory)
-appendOptional("docker-endpoint", s.KubeletDockerEndpoint)
+appendOptional("docker-endpoint", s.kubeletDockerEndpoint)
-appendOptional("pod-infra-container-image", s.KubeletPodInfraContainerImage)
+appendOptional("pod-infra-container-image", s.kubeletPodInfraContainerImage)
-appendOptional("host-network-sources", s.KubeletHostNetworkSources)
+appendOptional("host-network-sources", s.kubeletHostNetworkSources)
-appendOptional("network-plugin", s.KubeletNetworkPluginName)
+appendOptional("network-plugin", s.kubeletNetworkPluginName)

 log.V(1).Infof("prepared executor command %q with args '%+v'", ci.GetValue(), ci.Arguments)

@@ -429,8 +432,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E

 // Check for staticPods
 var staticPodCPUs, staticPodMem float64
-if s.StaticPodsConfigPath != "" {
+if s.staticPodsConfigPath != "" {
-bs, paths, err := archive.ZipDir(s.StaticPodsConfigPath)
+bs, paths, err := archive.ZipDir(s.staticPodsConfigPath)
 if err != nil {
 return nil, nil, err
 }
@@ -451,8 +454,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
 }

 // TODO(sttts): allow unlimited static pods as well and patch in the default resource limits
-unlimitedCPU := mresource.LimitPodCPU(&pod, s.DefaultContainerCPULimit)
+unlimitedCPU := mresource.LimitPodCPU(&pod, s.defaultContainerCPULimit)
-unlimitedMem := mresource.LimitPodMem(&pod, s.DefaultContainerMemLimit)
+unlimitedMem := mresource.LimitPodMem(&pod, s.defaultContainerMemLimit)
 if unlimitedCPU {
 return nil, nil, fmt.Errorf("found static pod without limit on cpu resources: %v", podPath)
 }
@@ -473,8 +476,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
 }

 execInfo.Resources = []*mesos.Resource{
-mutil.NewScalarResource("cpus", float64(s.MesosExecutorCPUs)+staticPodCPUs),
+mutil.NewScalarResource("cpus", float64(s.mesosExecutorCPUs)+staticPodCPUs),
-mutil.NewScalarResource("mem", float64(s.MesosExecutorMem)+staticPodMem),
+mutil.NewScalarResource("mem", float64(s.mesosExecutorMem)+staticPodMem),
 }

 // calculate ExecutorInfo hash to be used for validating compatibility
@@ -489,7 +492,7 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
 // TODO(jdef): hacked from kubelet/server/server.go
 // TODO(k8s): replace this with clientcmd
 func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) {
-authInfo, err := clientauth.LoadFromFile(s.AuthPath)
+authInfo, err := clientauth.LoadFromFile(s.authPath)
 if err != nil {
 log.Warningf("Could not load kubernetes auth path: %v. Continuing with defaults.", err)
 }
@@ -501,14 +504,14 @@ func (s *SchedulerServer) createAPIServerClient() (*client.Client, error) {
 if err != nil {
 return nil, err
 }
-if len(s.APIServerList) < 1 {
+if len(s.apiServerList) < 1 {
 return nil, fmt.Errorf("no api servers specified")
 }
 // TODO: adapt Kube client to support LB over several servers
-if len(s.APIServerList) > 1 {
+if len(s.apiServerList) > 1 {
 log.Infof("Multiple api servers specified. Picking first one")
 }
-clientConfig.Host = s.APIServerList[0]
+clientConfig.Host = s.apiServerList[0]
 c, err := client.New(&clientConfig)
 if err != nil {
 return nil, err
@@ -531,8 +534,8 @@ func (s *SchedulerServer) getDriver() (driver bindings.SchedulerDriver) {
 func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
 // get scheduler low-level config
 sc := schedcfg.CreateDefaultConfig()
-if s.SchedulerConfigFileName != "" {
+if s.schedulerConfigFileName != "" {
-f, err := os.Open(s.SchedulerConfigFileName)
+f, err := os.Open(s.schedulerConfigFileName)
 if err != nil {
 log.Fatalf("Cannot open scheduler config file: %v", err)
 }
@@ -545,18 +548,18 @@ func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {

 schedulerProcess, driverFactory, etcdClient, eid := s.bootstrap(hks, sc)

-if s.EnableProfiling {
+if s.enableProfiling {
 profile.InstallHandler(s.mux)
 }
 go runtime.Until(func() {
 log.V(1).Info("Starting HTTP interface")
-log.Error(http.ListenAndServe(net.JoinHostPort(s.Address.String(), strconv.Itoa(s.Port)), s.mux))
+log.Error(http.ListenAndServe(net.JoinHostPort(s.address.String(), strconv.Itoa(s.port)), s.mux))
 }, sc.HttpBindInterval.Duration, schedulerProcess.Terminal())

-if s.HA {
+if s.ha {
 validation := ha.ValidationFunc(validateLeadershipTransition)
 srv := ha.NewCandidate(schedulerProcess, driverFactory, validation)
-path := fmt.Sprintf(meta.DefaultElectionFormat, s.FrameworkName)
+path := fmt.Sprintf(meta.DefaultElectionFormat, s.frameworkName)
 sid := uid.New(eid.Group(), "").String()
 log.Infof("registering for election at %v with id %v", path, sid)
 go election.Notify(election.NewEtcdMasterElector(etcdClient), path, sid, srv, nil)
@@ -595,7 +598,7 @@ func (s *SchedulerServer) awaitFailover(schedulerProcess schedulerProcessInterfa
 case <-schedulerProcess.Failover():
 err = doFailover()
 default:
-if s.HA {
+if s.ha {
 err = fmt.Errorf("ha scheduler exiting instead of failing over")
 } else {
 log.Infof("exiting scheduler")
@@ -637,22 +640,22 @@ func newEtcd(etcdConfigFile string, etcdServerList []string) (client tools.EtcdC

 func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config) (*ha.SchedulerProcess, ha.DriverFactory, tools.EtcdClient, *uid.UID) {

-s.FrameworkName = strings.TrimSpace(s.FrameworkName)
+s.frameworkName = strings.TrimSpace(s.frameworkName)
-if s.FrameworkName == "" {
+if s.frameworkName == "" {
 log.Fatalf("framework-name must be a non-empty string")
 }
-s.FrameworkWebURI = strings.TrimSpace(s.FrameworkWebURI)
+s.frameworkWebURI = strings.TrimSpace(s.frameworkWebURI)

 metrics.Register()
 runtime.Register()
 s.mux.Handle("/metrics", prometheus.Handler())
 healthz.InstallHandler(s.mux)

-if (s.EtcdConfigFile != "" && len(s.EtcdServerList) != 0) || (s.EtcdConfigFile == "" && len(s.EtcdServerList) == 0) {
+if (s.etcdConfigFile != "" && len(s.etcdServerList) != 0) || (s.etcdConfigFile == "" && len(s.etcdServerList) == 0) {
 log.Fatalf("specify either --etcd-servers or --etcd-config")
 }

-if len(s.APIServerList) < 1 {
+if len(s.apiServerList) < 1 {
 log.Fatal("No api servers specified.")
 }

@@ -662,9 +665,9 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
 }
 s.client = client

-if s.ReconcileCooldown < defaultReconcileCooldown {
+if s.reconcileCooldown < defaultReconcileCooldown {
-s.ReconcileCooldown = defaultReconcileCooldown
+s.reconcileCooldown = defaultReconcileCooldown
-log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.ReconcileCooldown)
+log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.reconcileCooldown)
 }

 executor, eid, err := s.prepareExecutorInfo(hks)
@@ -676,25 +679,25 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
 // (1) the generic config store is available for the FrameworkId storage
 // (2) the generic master election is provided by the apiserver
 // Compare docs/proposals/high-availability.md
-etcdClient, err := newEtcd(s.EtcdConfigFile, s.EtcdServerList)
+etcdClient, err := newEtcd(s.etcdConfigFile, s.etcdServerList)
 if err != nil {
 log.Fatalf("misconfigured etcd: %v", err)
 }

-as := scheduler.NewAllocationStrategy(
+as := podschedulers.NewAllocationStrategy(
 podtask.NewDefaultPredicate(
-s.DefaultContainerCPULimit,
+s.defaultContainerCPULimit,
-s.DefaultContainerMemLimit,
+s.defaultContainerMemLimit,
 ),
 podtask.NewDefaultProcurement(
-s.DefaultContainerCPULimit,
+s.defaultContainerCPULimit,
-s.DefaultContainerMemLimit,
+s.defaultContainerMemLimit,
 ),
 )

 // downgrade allocation strategy if user disables "account-for-pod-resources"
-if !s.AccountForPodResources {
+if !s.accountForPodResources {
-as = scheduler.NewAllocationStrategy(
+as = podschedulers.NewAllocationStrategy(
 podtask.DefaultMinimalPredicate,
 podtask.DefaultMinimalProcurement)
 }
@@ -716,48 +719,61 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
 return n.(*api.Node)
 }

-fcfs := scheduler.NewFCFSPodScheduler(as, lookupNode)
+fcfs := podschedulers.NewFCFSPodScheduler(as, lookupNode)
-mesosPodScheduler := scheduler.New(scheduler.Config{
+framework := framework.New(framework.Config{
-Schedcfg: *sc,
+SchedulerConfig: *sc,
 Executor: executor,
-Scheduler: fcfs,
 Client: client,
-EtcdClient: etcdClient,
-FailoverTimeout: s.FailoverTimeout,
-ReconcileInterval: s.ReconcileInterval,
-ReconcileCooldown: s.ReconcileCooldown,
+FailoverTimeout: s.failoverTimeout,
+ReconcileInterval: s.reconcileInterval,
+ReconcileCooldown: s.reconcileCooldown,
 LookupNode: lookupNode,
+StoreFrameworkId: func(id string) {
+// TODO(jdef): port FrameworkId store to generic Kubernetes config store as soon as available
+_, err := etcdClient.Set(meta.FrameworkIDKey, id, uint64(s.failoverTimeout))
+if err != nil {
+log.Errorf("failed to renew frameworkId TTL: %v", err)
+}
+},
 })

-masterUri := s.MesosMaster
+masterUri := s.mesosMaster
 info, cred, err := s.buildFrameworkInfo()
 if err != nil {
 log.Fatalf("Misconfigured mesos framework: %v", err)
 }

-schedulerProcess := ha.New(mesosPodScheduler)
+schedulerProcess := ha.New(framework)
 dconfig := &bindings.DriverConfig{
 Scheduler: schedulerProcess,
 Framework: info,
 Master: masterUri,
 Credential: cred,
-BindingAddress: s.Address,
+BindingAddress: s.address,
-BindingPort: uint16(s.DriverPort),
+BindingPort: uint16(s.driverPort),
-HostnameOverride: s.HostnameOverride,
+HostnameOverride: s.hostnameOverride,
 WithAuthContext: func(ctx context.Context) context.Context {
-ctx = auth.WithLoginProvider(ctx, s.MesosAuthProvider)
+ctx = auth.WithLoginProvider(ctx, s.mesosAuthProvider)
-ctx = sasl.WithBindingAddress(ctx, s.Address)
+ctx = sasl.WithBindingAddress(ctx, s.address)
 return ctx
 },
 }

-kpl := scheduler.NewPlugin(mesosPodScheduler.NewDefaultPluginConfig(schedulerProcess.Terminal(), s.mux))
-runtime.On(mesosPodScheduler.Registration(), func() { kpl.Run(schedulerProcess.Terminal()) })
-runtime.On(mesosPodScheduler.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))
+// create event recorder sending events to the "" namespace of the apiserver
+broadcaster := record.NewBroadcaster()
+recorder := broadcaster.NewRecorder(api.EventSource{Component: "scheduler"})
+broadcaster.StartRecordingToSink(client.Events(""))
+
+// create scheduler core with all components arranged around it
+lw := cache.NewListWatchFromClient(client, "pods", api.NamespaceAll, fields.Everything())
+sched := components.New(sc, framework, fcfs, client, recorder, schedulerProcess.Terminal(), s.mux, lw)
+
+runtime.On(framework.Registration(), func() { sched.Run(schedulerProcess.Terminal()) })
+runtime.On(framework.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))

 driverFactory := ha.DriverFactory(func() (drv bindings.SchedulerDriver, err error) {
 log.V(1).Infoln("performing deferred initialization")
-if err = mesosPodScheduler.Init(schedulerProcess.Master(), kpl, s.mux); err != nil {
+if err = framework.Init(sched, schedulerProcess.Master(), s.mux); err != nil {
 return nil, fmt.Errorf("failed to initialize pod scheduler: %v", err)
 }
 log.V(1).Infoln("deferred init complete")
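Note: this hunk replaces the old scheduler/plugin pair with a framework plus a scheduler assembled by components.New and started only once the framework has registered with the Mesos master. As a toy, self-contained sketch of that "wire the pieces first, run them on a registration signal" shape only; the types and helpers here (frameworkStub, podScheduler, onRegistered) are stand-ins, not the PR's actual API:

package main

import (
	"fmt"
	"time"
)

// frameworkStub stands in for the Mesos framework: it exposes a channel that
// closes once registration with the master has happened.
type frameworkStub struct{ registered chan struct{} }

func (f *frameworkStub) Registration() <-chan struct{} { return f.registered }

// podScheduler stands in for the object assembled from the framework, pod
// scheduler, client, recorder, and so on.
type podScheduler struct{ name string }

func (s *podScheduler) Run(done <-chan struct{}) {
	fmt.Println(s.name, "running")
	<-done
	fmt.Println(s.name, "stopped")
}

// onRegistered mimics the run-on-registration wiring: run fn once the trigger
// channel closes.
func onRegistered(trigger <-chan struct{}, fn func()) {
	go func() {
		<-trigger
		fn()
	}()
}

func main() {
	fw := &frameworkStub{registered: make(chan struct{})}
	sched := &podScheduler{name: "pod-scheduler"}
	terminal := make(chan struct{}) // closed when the scheduler process ends

	// Arrange the components first, start them only after registration.
	onRegistered(fw.Registration(), func() { sched.Run(terminal) })

	close(fw.registered)              // simulate successful registration
	time.Sleep(50 * time.Millisecond) // give the goroutine time to start
	close(terminal)
	time.Sleep(50 * time.Millisecond)
}
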
@@ -806,14 +822,14 @@ func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkub
 args = append(args, fmt.Sprintf("--%s=%s", flag.Name, flag.Value.String()))
 }
 })
-if !s.Graceful {
+if !s.graceful {
 args = append(args, "--graceful")
 }
-if len(s.APIServerList) > 0 {
+if len(s.apiServerList) > 0 {
-args = append(args, "--api-servers="+strings.Join(s.APIServerList, ","))
+args = append(args, "--api-servers="+strings.Join(s.apiServerList, ","))
 }
-if len(s.EtcdServerList) > 0 {
+if len(s.etcdServerList) > 0 {
-args = append(args, "--etcd-servers="+strings.Join(s.EtcdServerList, ","))
+args = append(args, "--etcd-servers="+strings.Join(s.etcdServerList, ","))
 }
 args = append(args, flags.Args()...)

@@ -846,30 +862,30 @@ func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred
 }
 log.V(2).Infof("Framework configured with mesos user %v", username)
 info = &mesos.FrameworkInfo{
-Name: proto.String(s.FrameworkName),
+Name: proto.String(s.frameworkName),
 User: proto.String(username),
-Checkpoint: proto.Bool(s.Checkpoint),
+Checkpoint: proto.Bool(s.checkpoint),
 }
-if s.FrameworkWebURI != "" {
+if s.frameworkWebURI != "" {
-info.WebuiUrl = proto.String(s.FrameworkWebURI)
+info.WebuiUrl = proto.String(s.frameworkWebURI)
 }
-if s.FailoverTimeout > 0 {
+if s.failoverTimeout > 0 {
-info.FailoverTimeout = proto.Float64(s.FailoverTimeout)
+info.FailoverTimeout = proto.Float64(s.failoverTimeout)
 }
-if s.MesosRole != "" {
+if s.mesosRole != "" {
-info.Role = proto.String(s.MesosRole)
+info.Role = proto.String(s.mesosRole)
 }
-if s.MesosAuthPrincipal != "" {
+if s.mesosAuthPrincipal != "" {
-info.Principal = proto.String(s.MesosAuthPrincipal)
+info.Principal = proto.String(s.mesosAuthPrincipal)
-if s.MesosAuthSecretFile == "" {
+if s.mesosAuthSecretFile == "" {
 return nil, nil, errors.New("authentication principal specified without the required credentials file")
 }
-secret, err := ioutil.ReadFile(s.MesosAuthSecretFile)
+secret, err := ioutil.ReadFile(s.mesosAuthSecretFile)
 if err != nil {
 return nil, nil, err
 }
 cred = &mesos.Credential{
-Principal: proto.String(s.MesosAuthPrincipal),
+Principal: proto.String(s.mesosAuthPrincipal),
 Secret: secret,
 }
 }
@@ -877,7 +893,7 @@ func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred
 }

 func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdClient) (*mesos.FrameworkID, error) {
-if s.FailoverTimeout > 0 {
+if s.failoverTimeout > 0 {
 if response, err := client.Get(meta.FrameworkIDKey, false, false); err != nil {
 if !etcdstorage.IsEtcdNotFound(err) {
 return nil, fmt.Errorf("unexpected failure attempting to load framework ID from etcd: %v", err)
@@ -900,7 +916,7 @@ func (s *SchedulerServer) fetchFrameworkID(client tools.EtcdClient) (*mesos.Fram
 }

 func (s *SchedulerServer) getUsername() (username string, err error) {
-username = s.MesosUser
+username = s.mesosUser
 if username == "" {
 if u, err := user.Current(); err == nil {
 username = u.Username
@@ -121,8 +121,8 @@ func Test_DefaultResourceLimits(t *testing.T) {
 assert := assert.New(t)

 s := NewSchedulerServer()
-assert.Equal(s.DefaultContainerCPULimit, mresource.DefaultDefaultContainerCPULimit)
+assert.Equal(s.defaultContainerCPULimit, mresource.DefaultDefaultContainerCPULimit)
-assert.Equal(s.DefaultContainerMemLimit, mresource.DefaultDefaultContainerMemLimit)
+assert.Equal(s.defaultContainerMemLimit, mresource.DefaultDefaultContainerMemLimit)
 }

 func Test_StaticPods(t *testing.T) {