scheduler: implement role awareness

Sergiusz Urbaniak 2015-10-01 16:51:58 +02:00
parent 1a43dcf720
commit 9eae47c6e6
45 changed files with 2591 additions and 914 deletions

View File

@ -23,6 +23,7 @@ mesosmaster1:
- MESOS_QUORUM=1 - MESOS_QUORUM=1
- MESOS_REGISTRY=in_memory - MESOS_REGISTRY=in_memory
- MESOS_WORK_DIR=/var/lib/mesos - MESOS_WORK_DIR=/var/lib/mesos
- MESOS_ROLES=role1
links: links:
- etcd - etcd
- "ambassador:apiserver" - "ambassador:apiserver"
@ -40,15 +41,15 @@ mesosslave:
DOCKER_NETWORK_OFFSET=0.0.$${N}.0 DOCKER_NETWORK_OFFSET=0.0.$${N}.0
exec wrapdocker mesos-slave exec wrapdocker mesos-slave
--work_dir="/var/tmp/mesos/$${N}" --work_dir="/var/tmp/mesos/$${N}"
--attributes="rack:$${N};gen:201$${N}" --attributes="rack:$${N};gen:201$${N};role:role$${N}"
--hostname=$$(getent hosts mesosslave | cut -d' ' -f1 | sort -u | tail -1) --hostname=$$(getent hosts mesosslave | cut -d' ' -f1 | sort -u | tail -1)
--resources="cpus:4;mem:1280;disk:25600;ports:[8000-21099];cpus(role$${N}):1;mem(role$${N}):640;disk(role$${N}):25600;ports(role$${N}):[7000-7999]"
command: [] command: []
environment: environment:
- MESOS_MASTER=mesosmaster1:5050 - MESOS_MASTER=mesosmaster1:5050
- MESOS_PORT=5051 - MESOS_PORT=5051
- MESOS_LOG_DIR=/var/log/mesos - MESOS_LOG_DIR=/var/log/mesos
- MESOS_LOGGING_LEVEL=INFO - MESOS_LOGGING_LEVEL=INFO
- MESOS_RESOURCES=cpus:4;mem:1280;disk:25600;ports:[8000-21099]
- MESOS_SWITCH_USER=0 - MESOS_SWITCH_USER=0
- MESOS_CONTAINERIZERS=docker,mesos - MESOS_CONTAINERIZERS=docker,mesos
- MESOS_ISOLATION=cgroups/cpu,cgroups/mem - MESOS_ISOLATION=cgroups/cpu,cgroups/mem
@ -58,8 +59,6 @@ mesosslave:
- etcd - etcd
- mesosmaster1 - mesosmaster1
- "ambassador:apiserver" - "ambassador:apiserver"
volumes:
- ${MESOS_DOCKER_WORK_DIR}/mesosslave:/var/tmp/mesos
apiserver: apiserver:
hostname: apiserver hostname: apiserver
image: mesosphere/kubernetes-mesos image: mesosphere/kubernetes-mesos
@ -145,6 +144,7 @@ scheduler:
--mesos-executor-cpus=1.0 --mesos-executor-cpus=1.0
--mesos-sandbox-overlay=/opt/sandbox-overlay.tar.gz --mesos-sandbox-overlay=/opt/sandbox-overlay.tar.gz
--static-pods-config=/opt/static-pods --static-pods-config=/opt/static-pods
--mesos-roles=*,role1
--v=4 --v=4
--executor-logv=4 --executor-logv=4
--profiling=true --profiling=true

View File

@ -30,6 +30,93 @@ example, the Kubernetes-Mesos executor manages `k8s.mesosphere.io/attribute`
labels and will auto-detect and update modified attributes when the mesos-slave labels and will auto-detect and update modified attributes when the mesos-slave
is restarted. is restarted.
## Resource Roles
A Mesos cluster can be statically partitioned using [resource roles][2]. Each
resource is assigned such a role (`*` is the default role if none is explicitly
assigned on the mesos-slave command line). The Mesos master will send offers to
frameworks for `*` resources and, optionally, for one extra role that a
framework is assigned to. Currently Mesos supports only one such extra role per
framework.
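As an aside, here is a minimal Go sketch of how role-tagged resources look in the
mesos-go bindings used by this commit (it mirrors the `scalar` helper in the codec
test further below; the concrete role names and values are illustrative only):
```go
package main

import (
	"fmt"

	"github.com/mesos/mesos-go/mesosproto"
	"github.com/mesos/mesos-go/mesosutil"
)

// scalarWithRole builds a scalar resource tagged with the given role.
func scalarWithRole(name string, value float64, role string) *mesosproto.Resource {
	res := mesosutil.NewScalarResource(name, value)
	res.Role = &role
	return res
}

func main() {
	resources := []*mesosproto.Resource{
		scalarWithRole("cpus", 4, "*"),     // unreserved, i.e. the default role
		scalarWithRole("cpus", 1, "role1"), // statically reserved for role1
		scalarWithRole("mem", 640, "role1"),
	}
	for _, r := range resources {
		fmt.Printf("%s(%s): %v\n", r.GetName(), r.GetRole(), r.GetScalar().GetValue())
	}
}
```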
### Configuring Roles for the Scheduler
Every Mesos framework scheduler can choose among the offered `*` resources and
those of its extra role. The Kubernetes-Mesos scheduler supports this via the
`--mesos-roles` flag on the scheduler command line, e.g.
```bash
$ km scheduler ... --mesos-roles="*,role1" ...
```
This tells the Kubernetes-Mesos scheduler to default to `*` resources
if a pod is not explicitly assigned to another role. Moreover, the extra role
`role1` is allowed, i.e. the Mesos master will also send resources of role `role1`
to the Kubernetes-Mesos scheduler.
Note the following restrictions and possibilities:
- Due to the restrictions of Mesos, only one extra role may be provided on the
command line.
- It is allowed to pass only an extra role, without `*`, e.g. `--mesos-roles=role1`.
This means that the scheduler will not consider `*` resources at all.
- It is allowed to pass the extra role first, e.g. `--mesos-roles=role1,*`.
This means that `role1` is the default role for pods without a special role
assignment (see below), while `*` resources are still considered for pods with an
explicit `*` assignment.
### Specifying Roles for Pods
By default a pod is scheduled using resources of the role which comes first in
the list of scheduler roles.
A pod can opt out of this default behaviour using the `k8s.mesosphere.io/roles`
label:
```yaml
k8s.mesosphere.io/roles: role1,role2,role3
```
The format is a comma-separated list of allowed resource roles. The scheduler
will try to schedule the pod with `role1` resources first, use `role2`
resources if the former are not available, and finally fall back to `role3`
resources.
The `*` role may be specified as well in this list.
**Note:** An empty list means that no resource roles are allowed, which is
equivalent to making the pod unschedulable.
For example:
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: backend
  labels:
    k8s.mesosphere.io/roles: "*,prod,test,dev"
  namespace: prod
spec:
  ...
```
This `prod/backend` pod will be scheduled using resources from all four roles,
preferring `*` resources, followed by `prod`, `test` and `dev`. If none
of those four roles provides enough resources, scheduling fails.
**Note:** The scheduler also allows mixing different roles in the following
sense: if a node provides `cpu` resources for the `*` role, but `mem` resources
only for the `prod` role, the pod above will be scheduled using `cpu(*)` and
`mem(prod)` resources.
**Note:** The scheduler might also mix roles within one resource type, i.e. it will
use as many `cpu`s of the `*` role as possible. If a pod requires even more
`cpu` resources (defined via the `pod.spec.resources.limits` property) for successful
scheduling, the scheduler adds resources from the `prod`, `test` and `dev`
roles, in this order, until the pod's resource requirements are satisfied. E.g. a
pod might be scheduled with 0.5 `cpu(*)`, 1.5 `cpu(prod)` and 1 `cpu(test)`,
plus 2 GB of `mem(prod)`.
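A minimal sketch of this per-role accumulation for a single scalar resource type
(illustrative only, with made-up names; this is not the scheduler's actual
procurement code):
```go
package main

import "fmt"

// allocateScalar satisfies a scalar demand (e.g. cpus) by consuming offered
// resources role by role, in the given priority order. It returns how much was
// taken from each role, or an error if the demand cannot be met.
func allocateScalar(demand float64, roles []string, offered map[string]float64) (map[string]float64, error) {
	allocation := map[string]float64{}
	for _, role := range roles {
		if demand <= 0 {
			break
		}
		take := offered[role]
		if take > demand {
			take = demand
		}
		if take > 0 {
			allocation[role] = take
			demand -= take
		}
	}
	if demand > 0 {
		return nil, fmt.Errorf("unsatisfied demand: %.2f missing", demand)
	}
	return allocation, nil
}

func main() {
	// 3 cpus requested; the offer contains 0.5 cpu(*), 1.5 cpu(prod) and 2 cpu(test).
	alloc, err := allocateScalar(
		3.0,
		[]string{"*", "prod", "test", "dev"},
		map[string]float64{"*": 0.5, "prod": 1.5, "test": 2.0},
	)
	fmt.Println(alloc, err) // cpu(*)=0.5, cpu(prod)=1.5, cpu(test)=1, err=nil
}
```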
## Tuning ## Tuning
The scheduler configuration can be fine-tuned using an ini-style configuration file. The scheduler configuration can be fine-tuned using an ini-style configuration file.
@ -49,6 +136,7 @@ offer-ttl = 5s
; duration an expired offer lingers in history ; duration an expired offer lingers in history
offer-linger-ttl = 2m offer-linger-ttl = 2m
; duration between offer listener notifications ; duration between offer listener notifications
listener-delay = 1s listener-delay = 1s

View File

@ -17,6 +17,7 @@ limitations under the License.
package executor package executor
import ( import (
"bytes"
"encoding/json" "encoding/json"
"fmt" "fmt"
"strings" "strings"
@ -33,6 +34,7 @@ import (
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages" "k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/node" "k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/podutil" "k8s.io/kubernetes/contrib/mesos/pkg/podutil"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/executorinfo"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
unversionedapi "k8s.io/kubernetes/pkg/api/unversioned" unversionedapi "k8s.io/kubernetes/pkg/api/unversioned"
@ -223,13 +225,21 @@ func (k *Executor) sendPodsSnapshot() bool {
} }
// Registered is called when the executor is successfully registered with the slave. // Registered is called when the executor is successfully registered with the slave.
func (k *Executor) Registered(driver bindings.ExecutorDriver, func (k *Executor) Registered(
executorInfo *mesos.ExecutorInfo, frameworkInfo *mesos.FrameworkInfo, slaveInfo *mesos.SlaveInfo) { driver bindings.ExecutorDriver,
executorInfo *mesos.ExecutorInfo,
frameworkInfo *mesos.FrameworkInfo,
slaveInfo *mesos.SlaveInfo,
) {
if k.isDone() { if k.isDone() {
return return
} }
log.Infof("Executor %v of framework %v registered with slave %v\n",
executorInfo, frameworkInfo, slaveInfo) log.Infof(
"Executor %v of framework %v registered with slave %v\n",
executorInfo, frameworkInfo, slaveInfo,
)
if !(&k.state).transition(disconnectedState, connectedState) { if !(&k.state).transition(disconnectedState, connectedState) {
log.Errorf("failed to register/transition to a connected state") log.Errorf("failed to register/transition to a connected state")
} }
@ -241,8 +251,22 @@ func (k *Executor) Registered(driver bindings.ExecutorDriver,
} }
} }
annotations, err := executorInfoToAnnotations(executorInfo)
if err != nil {
log.Errorf(
"cannot get node annotations from executor info %v error %v",
executorInfo, err,
)
}
if slaveInfo != nil { if slaveInfo != nil {
_, err := node.CreateOrUpdate(k.client, slaveInfo.GetHostname(), node.SlaveAttributesToLabels(slaveInfo.Attributes)) _, err := node.CreateOrUpdate(
k.client,
slaveInfo.GetHostname(),
node.SlaveAttributesToLabels(slaveInfo.Attributes),
annotations,
)
if err != nil { if err != nil {
log.Errorf("cannot update node labels: %v", err) log.Errorf("cannot update node labels: %v", err)
} }
@ -270,7 +294,13 @@ func (k *Executor) Reregistered(driver bindings.ExecutorDriver, slaveInfo *mesos
} }
if slaveInfo != nil { if slaveInfo != nil {
_, err := node.CreateOrUpdate(k.client, slaveInfo.GetHostname(), node.SlaveAttributesToLabels(slaveInfo.Attributes)) _, err := node.CreateOrUpdate(
k.client,
slaveInfo.GetHostname(),
node.SlaveAttributesToLabels(slaveInfo.Attributes),
nil, // don't change annotations
)
if err != nil { if err != nil {
log.Errorf("cannot update node labels: %v", err) log.Errorf("cannot update node labels: %v", err)
} }
@ -988,3 +1018,20 @@ func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo {
} }
return ni return ni
} }
func executorInfoToAnnotations(ei *mesos.ExecutorInfo) (annotations map[string]string, err error) {
annotations = map[string]string{}
if ei == nil {
return
}
var buf bytes.Buffer
if err = executorinfo.EncodeResources(&buf, ei.GetResources()); err != nil {
return
}
annotations[meta.ExecutorIdKey] = ei.GetExecutorId().GetValue()
annotations[meta.ExecutorResourcesKey] = buf.String()
return
}

View File

@ -168,10 +168,23 @@ func TestExecutorLaunchAndKillTask(t *testing.T) {
} }
pod := NewTestPod(1) pod := NewTestPod(1)
podTask, err := podtask.New(api.NewDefaultContext(), "", pod) executorinfo := &mesosproto.ExecutorInfo{}
podTask, err := podtask.New(
api.NewDefaultContext(),
"",
pod,
executorinfo,
nil,
)
assert.Equal(t, nil, err, "must be able to create a task from a pod") assert.Equal(t, nil, err, "must be able to create a task from a pod")
taskInfo := podTask.BuildTaskInfo(&mesosproto.ExecutorInfo{}) podTask.Spec = &podtask.Spec{
Executor: executorinfo,
}
taskInfo, err := podTask.BuildTaskInfo()
assert.Equal(t, nil, err, "must be able to build task info")
data, err := testapi.Default.Codec().Encode(pod) data, err := testapi.Default.Codec().Encode(pod)
assert.Equal(t, nil, err, "must be able to encode a pod's spec data") assert.Equal(t, nil, err, "must be able to encode a pod's spec data")
taskInfo.Data = data taskInfo.Data = data
@ -370,8 +383,21 @@ func TestExecutorFrameworkMessage(t *testing.T) {
// set up a pod to then lose // set up a pod to then lose
pod := NewTestPod(1) pod := NewTestPod(1)
podTask, _ := podtask.New(api.NewDefaultContext(), "foo", pod) executorinfo := &mesosproto.ExecutorInfo{}
taskInfo := podTask.BuildTaskInfo(&mesosproto.ExecutorInfo{}) podTask, _ := podtask.New(
api.NewDefaultContext(),
"foo",
pod,
executorinfo,
nil,
)
podTask.Spec = &podtask.Spec{
Executor: executorinfo,
}
taskInfo, err := podTask.BuildTaskInfo()
assert.Equal(t, nil, err, "must be able to build task info")
data, _ := testapi.Default.Codec().Encode(pod) data, _ := testapi.Default.Codec().Encode(pod)
taskInfo.Data = data taskInfo.Data = data

View File

@ -17,12 +17,13 @@ limitations under the License.
package node package node
import ( import (
"encoding/json"
"fmt" "fmt"
"reflect" "reflect"
"strconv" "strconv"
"strings" "strings"
"time"
log "github.com/golang/glog" log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto" mesos "github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
@ -33,14 +34,21 @@ import (
const ( const (
labelPrefix = "k8s.mesosphere.io/attribute-" labelPrefix = "k8s.mesosphere.io/attribute-"
clientRetryCount = 5
clientRetryInterval = time.Second
) )
// Create creates a new node api object with the given hostname and labels // Create creates a new node api object with the given hostname,
func Create(client *client.Client, hostName string, labels map[string]string) (*api.Node, error) { // slave attribute labels and annotations
func Create(
client *client.Client,
hostName string,
slaveAttrLabels,
annotations map[string]string,
) (*api.Node, error) {
n := api.Node{ n := api.Node{
ObjectMeta: api.ObjectMeta{ ObjectMeta: api.ObjectMeta{
Name: hostName, Name: hostName,
Labels: map[string]string{"kubernetes.io/hostname": hostName},
}, },
Spec: api.NodeSpec{ Spec: api.NodeSpec{
ExternalID: hostName, ExternalID: hostName,
@ -49,77 +57,91 @@ func Create(client *client.Client, hostName string, labels map[string]string) (*
Phase: api.NodePending, Phase: api.NodePending,
}, },
} }
for k, v := range labels {
n.Labels[k] = v n.Labels = mergeMaps(
} map[string]string{"kubernetes.io/hostname": hostName},
slaveAttrLabels,
)
n.Annotations = annotations
// try to create // try to create
return client.Nodes().Create(&n) return client.Nodes().Create(&n)
} }
// Update updates an existing node api object with new labels // Update updates an existing node api object
func Update(client *client.Client, n *api.Node, labels map[string]string) (*api.Node, error) { // by looking up the given hostname.
patch := struct { // The updated node merges the given slave attribute labels
Metadata struct { // and annotations with the found api object.
Labels map[string]string `json:"labels"` func Update(
} `json:"metadata"` client *client.Client,
}{} hostname string,
patch.Metadata.Labels = map[string]string{} slaveAttrLabels,
for k, v := range n.Labels { annotations map[string]string,
if !IsSlaveAttributeLabel(k) { ) (n *api.Node, err error) {
patch.Metadata.Labels[k] = v for i := 0; i < clientRetryCount; i++ {
} n, err = client.Nodes().Get(hostname)
}
for k, v := range labels {
patch.Metadata.Labels[k] = v
}
patchJson, _ := json.Marshal(patch)
log.V(4).Infof("Patching labels of node %q: %v", n.Name, string(patchJson))
err := client.Patch(api.MergePatchType).RequestURI(n.SelfLink).Body(patchJson).Do().Error()
if err != nil { if err != nil {
return nil, fmt.Errorf("error updating labels of node %q: %v", n.Name, err) return nil, fmt.Errorf("error getting node %q: %v", hostname, err)
}
if n == nil {
return nil, fmt.Errorf("no node instance returned for %q", hostname)
}
// update labels derived from Mesos slave attributes, keep all other labels
n.Labels = mergeMaps(
filterMap(n.Labels, IsNotSlaveAttributeLabel),
slaveAttrLabels,
)
n.Annotations = mergeMaps(n.Annotations, annotations)
n, err = client.Nodes().Update(n)
if err == nil && !errors.IsConflict(err) {
return n, nil
}
log.Infof("retry %d/%d: error updating node %v err %v", i, clientRetryCount, n, err)
time.Sleep(time.Duration(i) * clientRetryInterval)
} }
newNode, err := api.Scheme.DeepCopy(n)
if err != nil {
return nil, err return nil, err
}
newNode.(*api.Node).Labels = patch.Metadata.Labels
return newNode.(*api.Node), nil
} }
// CreateOrUpdate tries to create a node api object or updates an already existing one // CreateOrUpdate creates a node api object or updates an existing one
func CreateOrUpdate(client *client.Client, hostName string, labels map[string]string) (*api.Node, error) { func CreateOrUpdate(
n, err := Create(client, hostName, labels) client *client.Client,
hostname string,
slaveAttrLabels,
annotations map[string]string,
) (*api.Node, error) {
n, err := Create(client, hostname, slaveAttrLabels, annotations)
if err == nil { if err == nil {
return n, nil return n, nil
} }
if !errors.IsAlreadyExists(err) { if !errors.IsAlreadyExists(err) {
return nil, fmt.Errorf("unable to register %q with the apiserver: %v", hostName, err) return nil, fmt.Errorf("unable to register %q with the apiserver: %v", hostname, err)
} }
// fall back to update an old node with new labels // fall back to update an old node with new labels
n, err = client.Nodes().Get(hostName) return Update(client, hostname, slaveAttrLabels, annotations)
if err != nil { }
return nil, fmt.Errorf("error getting node %q: %v", hostName, err)
} // IsNotSlaveAttributeLabel returns true iff the given label is not derived from a slave attribute
if n == nil { func IsNotSlaveAttributeLabel(key, value string) bool {
return nil, fmt.Errorf("no node instance returned for %q", hostName) return !IsSlaveAttributeLabel(key, value)
}
return Update(client, n, labels)
} }
// IsSlaveAttributeLabel returns true iff the given label is derived from a slave attribute // IsSlaveAttributeLabel returns true iff the given label is derived from a slave attribute
func IsSlaveAttributeLabel(l string) bool { func IsSlaveAttributeLabel(key, value string) bool {
return strings.HasPrefix(l, labelPrefix) return strings.HasPrefix(key, labelPrefix)
} }
// IsUpToDate returns true iff the node's slave labels match the given attributes labels // IsUpToDate returns true iff the node's slave labels match the given attributes labels
func IsUpToDate(n *api.Node, labels map[string]string) bool { func IsUpToDate(n *api.Node, labels map[string]string) bool {
slaveLabels := map[string]string{} slaveLabels := map[string]string{}
for k, v := range n.Labels { for k, v := range n.Labels {
if IsSlaveAttributeLabel(k) { if IsSlaveAttributeLabel(k, "") {
slaveLabels[k] = v slaveLabels[k] = v
} }
} }
@ -158,3 +180,33 @@ func SlaveAttributesToLabels(attrs []*mesos.Attribute) map[string]string {
} }
return l return l
} }
// filterMap filters the given map and returns a new map
// containing all original elements matching the given key-value predicate.
func filterMap(m map[string]string, predicate func(string, string) bool) map[string]string {
result := make(map[string]string, len(m))
for k, v := range m {
if predicate(k, v) {
result[k] = v
}
}
return result
}
// mergeMaps merges all given maps into a single map.
// There is no advanced key conflict resolution.
// The last key from the given maps wins.
func mergeMaps(ms ...map[string]string) map[string]string {
var l int
for _, m := range ms {
l += len(m)
}
result := make(map[string]string, l)
for _, m := range ms {
for k, v := range m {
result[k] = v
}
}
return result
}

View File

@ -96,16 +96,16 @@ func (r *clientRegistrator) Run(terminate <-chan struct{}) error {
if n == nil { if n == nil {
log.V(2).Infof("creating node %s with labels %v", rg.hostName, rg.labels) log.V(2).Infof("creating node %s with labels %v", rg.hostName, rg.labels)
_, err := CreateOrUpdate(r.client, rg.hostName, rg.labels) _, err := CreateOrUpdate(r.client, rg.hostName, rg.labels, nil)
if err != nil { if err != nil {
log.Errorf("error creating the node %s: %v", rg.hostName, rg.labels) log.Errorf("error creating the node %s: %v", rg.hostName, rg.labels)
} }
} else { } else {
log.V(2).Infof("updating node %s with labels %v", rg.hostName, rg.labels) log.V(2).Infof("updating node %s with labels %v", rg.hostName, rg.labels)
_, err := Update(r.client, n, rg.labels) _, err := Update(r.client, rg.hostName, rg.labels, nil)
if err != nil && errors.IsNotFound(err) { if err != nil && errors.IsNotFound(err) {
// last chance when our store was out of date // last chance when our store was out of date
_, err = Create(r.client, rg.hostName, rg.labels) _, err = Create(r.client, rg.hostName, rg.labels, nil)
} }
if err != nil { if err != nil {
log.Errorf("error updating the node %s: %v", rg.hostName, rg.labels) log.Errorf("error updating the node %s: %v", rg.hostName, rg.labels)

View File

@ -20,16 +20,22 @@ import (
"fmt" "fmt"
log "github.com/golang/glog" log "github.com/golang/glog"
"github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/contrib/mesos/pkg/offers" "k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/queue" "k8s.io/kubernetes/contrib/mesos/pkg/queue"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache" "k8s.io/kubernetes/pkg/client/cache"
) )
// SchedulerAlgorithm is the interface that orchestrates the pod scheduling.
//
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selectedMachine's hostname or an error if the schedule failed.
type SchedulerAlgorithm interface { type SchedulerAlgorithm interface {
Schedule(pod *api.Pod) (string, error) Schedule(pod *api.Pod) (string, error)
} }
@ -39,18 +45,34 @@ type schedulerAlgorithm struct {
sched scheduler.Scheduler sched scheduler.Scheduler
podUpdates queue.FIFO podUpdates queue.FIFO
podScheduler podschedulers.PodScheduler podScheduler podschedulers.PodScheduler
prototype *mesosproto.ExecutorInfo
roles []string
defaultCpus mresource.CPUShares
defaultMem mresource.MegaBytes
} }
func New(sched scheduler.Scheduler, podUpdates queue.FIFO, podScheduler podschedulers.PodScheduler) SchedulerAlgorithm { // New returns a new SchedulerAlgorithm
// TODO(sur): refactor params to separate config object
func New(
sched scheduler.Scheduler,
podUpdates queue.FIFO,
podScheduler podschedulers.PodScheduler,
prototype *mesosproto.ExecutorInfo,
roles []string,
defaultCpus mresource.CPUShares,
defaultMem mresource.MegaBytes,
) SchedulerAlgorithm {
return &schedulerAlgorithm{ return &schedulerAlgorithm{
sched: sched, sched: sched,
podUpdates: podUpdates, podUpdates: podUpdates,
podScheduler: podScheduler, podScheduler: podScheduler,
roles: roles,
prototype: prototype,
defaultCpus: defaultCpus,
defaultMem: defaultMem,
} }
} }
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selectedMachine's name and error (if there's any).
func (k *schedulerAlgorithm) Schedule(pod *api.Pod) (string, error) { func (k *schedulerAlgorithm) Schedule(pod *api.Pod) (string, error) {
log.Infof("Try to schedule pod %v\n", pod.Name) log.Infof("Try to schedule pod %v\n", pod.Name)
ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace) ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
@ -74,13 +96,18 @@ func (k *schedulerAlgorithm) Schedule(pod *api.Pod) (string, error) {
log.Warningf("aborting Schedule, unable to understand pod object %+v", pod) log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
return "", errors.NoSuchPodErr return "", errors.NoSuchPodErr
} }
if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted { if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
// avoid scheduling a pod that's been deleted between yieldPod() and Schedule() // avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
log.Infof("aborting Schedule, pod has been deleted %+v", pod) log.Infof("aborting Schedule, pod has been deleted %+v", pod)
return "", errors.NoSuchPodErr return "", errors.NoSuchPodErr
} }
podTask, err := podtask.New(ctx, "", pod) // write resource limits into the pod spec.
// From here on we can expect that the pod spec of a task has proper limits for CPU and memory.
k.limitPod(pod)
podTask, err := podtask.New(ctx, "", pod, k.prototype, k.roles)
if err != nil { if err != nil {
log.Warningf("aborting Schedule, unable to create podtask object %+v: %v", pod, err) log.Warningf("aborting Schedule, unable to create podtask object %+v: %v", pod, err)
return "", err return "", err
@ -115,7 +142,29 @@ func (k *schedulerAlgorithm) Schedule(pod *api.Pod) (string, error) {
} }
} }
// Call ScheduleFunc and subtract some resources, returning the name of the machine the task is scheduled on // limitPod limits the given pod based on the scheduler's default limits.
func (k *schedulerAlgorithm) limitPod(pod *api.Pod) error {
cpuRequest, cpuLimit, _, err := mresource.LimitPodCPU(pod, k.defaultCpus)
if err != nil {
return err
}
memRequest, memLimit, _, err := mresource.LimitPodMem(pod, k.defaultMem)
if err != nil {
return err
}
log.V(3).Infof(
"setting pod %s/%s resources: requested cpu %.2f mem %.2f MB, limited cpu %.2f mem %.2f MB",
pod.Namespace, pod.Name, cpuRequest, memRequest, cpuLimit, memLimit,
)
return nil
}
// doSchedule implements the actual scheduling of the given pod task.
// It checks whether the offer has been accepted and is still present in the offer registry.
// It delegates to the actual pod scheduler and updates the task registry.
func (k *schedulerAlgorithm) doSchedule(task *podtask.T) (string, error) { func (k *schedulerAlgorithm) doSchedule(task *podtask.T) (string, error) {
var offer offers.Perishable var offer offers.Perishable
var err error var err error
@ -134,8 +183,9 @@ func (k *schedulerAlgorithm) doSchedule(task *podtask.T) (string, error) {
} }
} }
var spec *podtask.Spec
if offer == nil { if offer == nil {
offer, err = k.podScheduler.SchedulePod(k.sched.Offers(), task) offer, spec, err = k.podScheduler.SchedulePod(k.sched.Offers(), task)
} }
if err != nil { if err != nil {
@ -152,11 +202,7 @@ func (k *schedulerAlgorithm) doSchedule(task *podtask.T) (string, error) {
} }
task.Offer = offer task.Offer = offer
if err := k.podScheduler.Procurement()(task, details); err != nil { task.Spec = spec
offer.Release()
task.Reset()
return "", err
}
if err := k.sched.Tasks().Update(task); err != nil { if err := k.sched.Tasks().Update(task); err != nil {
offer.Release() offer.Release()

View File

@ -21,51 +21,28 @@ import (
log "github.com/golang/glog" log "github.com/golang/glog"
"github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/contrib/mesos/pkg/node" "k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers" "k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/errors"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
) )
type allocationStrategy struct {
fitPredicate podtask.FitPredicate
procurement podtask.Procurement
}
func (a *allocationStrategy) FitPredicate() podtask.FitPredicate {
return a.fitPredicate
}
func (a *allocationStrategy) Procurement() podtask.Procurement {
return a.procurement
}
func NewAllocationStrategy(fitPredicate podtask.FitPredicate, procurement podtask.Procurement) AllocationStrategy {
if fitPredicate == nil {
panic("fitPredicate is required")
}
if procurement == nil {
panic("procurement is required")
}
return &allocationStrategy{
fitPredicate: fitPredicate,
procurement: procurement,
}
}
type fcfsPodScheduler struct { type fcfsPodScheduler struct {
AllocationStrategy procurement podtask.Procurement
lookupNode node.LookupFunc lookupNode node.LookupFunc
} }
func NewFCFSPodScheduler(as AllocationStrategy, lookupNode node.LookupFunc) PodScheduler { func NewFCFSPodScheduler(pr podtask.Procurement, lookupNode node.LookupFunc) PodScheduler {
return &fcfsPodScheduler{as, lookupNode} return &fcfsPodScheduler{pr, lookupNode}
} }
// A first-come-first-serve scheduler: acquires the first offer that can support the task // A first-come-first-serve scheduler: acquires the first offer that can support the task
func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, error) { func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, *podtask.Spec, error) {
podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name) podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name)
var acceptedOffer offers.Perishable var matchingOffer offers.Perishable
var acceptedSpec *podtask.Spec
err := r.Walk(func(p offers.Perishable) (bool, error) { err := r.Walk(func(p offers.Perishable) (bool, error) {
offer := p.Details() offer := p.Details()
if offer == nil { if offer == nil {
@ -82,25 +59,43 @@ func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, task *podtask.T) (of
return false, nil return false, nil
} }
if fps.FitPredicate()(task, offer, n) { ps := podtask.NewProcureState(offer)
if p.Acquire() { err := fps.procurement.Procure(task, n, ps)
acceptedOffer = p if err != nil {
log.V(5).Infof(
"Offer %q does not fit pod %s/%s: %v",
offer.Id, task.Pod.Namespace, task.Pod.Name, err,
)
return false, nil // continue
}
if !p.Acquire() {
log.V(2).Infof(
"Could not acquire offer %q for pod %s/%s",
offer.Id, task.Pod.Namespace, task.Pod.Name,
)
return false, nil // continue
}
matchingOffer = p
acceptedSpec, _ = ps.Result()
log.V(3).Infof("Pod %s accepted offer %v", podName, offer.Id.GetValue()) log.V(3).Infof("Pod %s accepted offer %v", podName, offer.Id.GetValue())
return true, nil // stop, we found an offer return true, nil // stop, we found an offer
}
}
return false, nil // continue
}) })
if acceptedOffer != nil { if matchingOffer != nil {
if err != nil { if err != nil {
log.Warningf("problems walking the offer registry: %v, attempting to continue", err) log.Warningf("problems walking the offer registry: %v, attempting to continue", err)
} }
return acceptedOffer, nil return matchingOffer, acceptedSpec, nil
} }
if err != nil { if err != nil {
log.V(2).Infof("failed to find a fit for pod: %s, err = %v", podName, err) log.V(2).Infof("failed to find a fit for pod: %s, err = %v", podName, err)
return nil, err return nil, nil, err
} }
log.V(2).Infof("failed to find a fit for pod: %s", podName) log.V(2).Infof("failed to find a fit for pod: %s", podName)
return nil, errors.NoSuitableOffersErr return nil, nil, errors.NoSuitableOffersErr
}
func (fps *fcfsPodScheduler) Fit(t *podtask.T, offer *mesosproto.Offer, n *api.Node) bool {
return fps.procurement.Procure(t, n, podtask.NewProcureState(offer)) == nil
} }

View File

@ -17,29 +17,25 @@ limitations under the License.
package podschedulers package podschedulers
import ( import (
"github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/contrib/mesos/pkg/offers" "k8s.io/kubernetes/contrib/mesos/pkg/offers"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/pkg/api"
) )
type AllocationStrategy interface { // SchedulePod is the interface which schedules pods.
// FitPredicate returns the selector used to determine pod fitness w/ respect to a given offer // There can be different implementation for different scheduling policies.
FitPredicate() podtask.FitPredicate //
// SchedulePod accepts a set of offers and a single pod task, which aligns well
// Procurement returns a func that obtains resources for a task from resource offer // with the k8s scheduling algorithm. It returns an offer that is acceptable
Procurement() podtask.Procurement // for the pod, else nil. The caller is responsible for filling in task
} // state w/ relevant offer details.
//
// See the FCFSPodScheduler for example.
//
// Fit checks whether a given podtask can be scheduled for the given offer on the given node.
type PodScheduler interface { type PodScheduler interface {
AllocationStrategy SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, *podtask.Spec, error)
// SchedulePod implements how to schedule pods among slaves. Fit(*podtask.T, *mesosproto.Offer, *api.Node) bool
// We can have different implementation for different scheduling policy.
//
// The function accepts a set of offers and a single pod, which aligns well
// with the k8s scheduling algorithm. It returns an offerId that is acceptable
// for the pod, otherwise nil. The caller is responsible for filling in task
// state w/ relevant offer details.
//
// See the FCFSPodScheduler for example.
SchedulePod(r offers.Registry, task *podtask.T) (offers.Perishable, error)
} }

View File

@ -98,8 +98,11 @@ func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (e
} }
if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil { if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil {
log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\", cpu %.2f, mem %.2f MB", log.V(2).Infof(
task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name, task.Spec.CPU, task.Spec.Memory) "launching task: %q on target %q slave %q for pod \"%v/%v\", resources %v",
task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name, task.Spec.Resources,
)
if err = b.sched.LaunchTask(task); err == nil { if err = b.sched.LaunchTask(task); err == nil {
b.sched.Offers().Invalidate(offerId) b.sched.Offers().Invalidate(offerId)
task.Set(podtask.Launched) task.Set(podtask.Launched)

View File

@ -19,6 +19,7 @@ package deleter
import ( import (
"testing" "testing"
"github.com/mesos/mesos-go/mesosproto"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/queue" "k8s.io/kubernetes/contrib/mesos/pkg/queue"
types "k8s.io/kubernetes/contrib/mesos/pkg/scheduler" types "k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
@ -60,7 +61,13 @@ func TestDeleteOne_PendingPod(t *testing.T) {
UID: "foo0", UID: "foo0",
Namespace: api.NamespaceDefault, Namespace: api.NamespaceDefault,
}}} }}}
task, err := podtask.New(api.NewDefaultContext(), "bar", pod.Pod) task, err := podtask.New(
api.NewDefaultContext(),
"bar",
pod.Pod,
&mesosproto.ExecutorInfo{},
nil,
)
if err != nil { if err != nil {
t.Fatalf("failed to create task: %v", err) t.Fatalf("failed to create task: %v", err)
} }
@ -100,7 +107,13 @@ func TestDeleteOne_Running(t *testing.T) {
UID: "foo0", UID: "foo0",
Namespace: api.NamespaceDefault, Namespace: api.NamespaceDefault,
}}} }}}
task, err := podtask.New(api.NewDefaultContext(), "bar", pod.Pod) task, err := podtask.New(
api.NewDefaultContext(),
"bar",
pod.Pod,
&mesosproto.ExecutorInfo{},
nil,
)
if err != nil { if err != nil {
t.Fatalf("unexpected error: %v", err) t.Fatalf("unexpected error: %v", err)
} }

View File

@ -28,7 +28,6 @@ import (
mesos "github.com/mesos/mesos-go/mesosproto" mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil" mutil "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler" bindings "github.com/mesos/mesos-go/scheduler"
execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages" "k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/node" "k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers" "k8s.io/kubernetes/contrib/mesos/pkg/offers"
@ -42,7 +41,6 @@ import (
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors" "k8s.io/kubernetes/pkg/api/errors"
client "k8s.io/kubernetes/pkg/client/unversioned" client "k8s.io/kubernetes/pkg/client/unversioned"
@ -71,13 +69,13 @@ type framework struct {
// Config related, write-once // Config related, write-once
sched scheduler.Scheduler sched scheduler.Scheduler
schedulerConfig *schedcfg.Config schedulerConfig *schedcfg.Config
executor *mesos.ExecutorInfo
executorGroup uint64
client *client.Client client *client.Client
failoverTimeout float64 // in seconds failoverTimeout float64 // in seconds
reconcileInterval int64 reconcileInterval int64
nodeRegistrator node.Registrator nodeRegistrator node.Registrator
storeFrameworkId func(id string) storeFrameworkId func(id string)
lookupNode node.LookupFunc
executorId *mesos.ExecutorID
// Mesos context // Mesos context
driver bindings.SchedulerDriver // late initialization driver bindings.SchedulerDriver // late initialization
@ -99,7 +97,7 @@ type framework struct {
type Config struct { type Config struct {
SchedulerConfig schedcfg.Config SchedulerConfig schedcfg.Config
Executor *mesos.ExecutorInfo ExecutorId *mesos.ExecutorID
Client *client.Client Client *client.Client
StoreFrameworkId func(id string) StoreFrameworkId func(id string)
FailoverTimeout float64 FailoverTimeout float64
@ -114,12 +112,11 @@ func New(config Config) Framework {
k = &framework{ k = &framework{
schedulerConfig: &config.SchedulerConfig, schedulerConfig: &config.SchedulerConfig,
RWMutex: new(sync.RWMutex), RWMutex: new(sync.RWMutex),
executor: config.Executor,
executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
client: config.Client, client: config.Client,
failoverTimeout: config.FailoverTimeout, failoverTimeout: config.FailoverTimeout,
reconcileInterval: config.ReconcileInterval, reconcileInterval: config.ReconcileInterval,
nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode), nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode),
executorId: config.ExecutorId,
offers: offers.CreateRegistry(offers.RegistryConfig{ offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool { Compat: func(o *mesos.Offer) bool {
// the node must be registered and have up-to-date labels // the node must be registered and have up-to-date labels
@ -128,10 +125,17 @@ func New(config Config) Framework {
return false return false
} }
// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours eids := len(o.GetExecutorIds())
for _, eid := range o.GetExecutorIds() { switch {
execuid := uid.Parse(eid.GetValue()) case eids > 1:
if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup { // at most one executor id expected. More than one means that
// the given node is seriously in trouble.
return false
case eids == 1:
// the executor id must match, otherwise the running executor
// is incompatible with the current scheduler configuration.
if eid := o.GetExecutorIds()[0]; eid.GetValue() != config.ExecutorId.GetValue() {
return false return false
} }
} }
@ -161,6 +165,7 @@ func New(config Config) Framework {
return proc.ErrorChanf("cannot execute action with unregistered scheduler") return proc.ErrorChanf("cannot execute action with unregistered scheduler")
}), }),
storeFrameworkId: config.StoreFrameworkId, storeFrameworkId: config.StoreFrameworkId,
lookupNode: config.LookupNode,
} }
return k return k
} }
@ -188,6 +193,45 @@ func (k *framework) asMaster() proc.Doer {
return k.asRegisteredMaster return k.asRegisteredMaster
} }
// An executorRef holds a reference to an executor and the slave it is running on
type executorRef struct {
executorID *mesos.ExecutorID
slaveID *mesos.SlaveID
}
// executorRefs returns a slice of references to the running executors known to this framework
func (k *framework) executorRefs() []executorRef {
slaves := k.slaveHostNames.SlaveIDs()
refs := make([]executorRef, 0, len(slaves))
for _, slaveID := range slaves {
hostname := k.slaveHostNames.HostName(slaveID)
if hostname == "" {
log.Warningf("hostname lookup for slaveID %q failed", slaveID)
continue
}
node := k.lookupNode(hostname)
if node == nil {
log.Warningf("node lookup for slaveID %q failed", slaveID)
continue
}
eid, ok := node.Annotations[meta.ExecutorIdKey]
if !ok {
log.Warningf("unable to find %q annotation for node %v", meta.ExecutorIdKey, node)
continue
}
refs = append(refs, executorRef{
executorID: mutil.NewExecutorID(eid),
slaveID: mutil.NewSlaveID(slaveID),
})
}
return refs
}
func (k *framework) installDebugHandlers(mux *http.ServeMux) { func (k *framework) installDebugHandlers(mux *http.ServeMux) {
wrappedHandler := func(uri string, h http.Handler) { wrappedHandler := func(uri string, h http.Handler) {
mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) { mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
@ -210,6 +254,7 @@ func (k *framework) installDebugHandlers(mux *http.ServeMux) {
} }
}) })
} }
requestReconciliation := func(uri string, requestAction func()) { requestReconciliation := func(uri string, requestAction func()) {
wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestAction() requestAction()
@ -220,18 +265,34 @@ func (k *framework) installDebugHandlers(mux *http.ServeMux) {
requestReconciliation("/debug/actions/requestImplicit", k.tasksReconciler.RequestImplicit) requestReconciliation("/debug/actions/requestImplicit", k.tasksReconciler.RequestImplicit)
wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
slaves := k.slaveHostNames.SlaveIDs() refs := k.executorRefs()
for _, slaveId := range slaves {
for _, ref := range refs {
_, err := k.driver.SendFrameworkMessage( _, err := k.driver.SendFrameworkMessage(
k.executor.ExecutorId, ref.executorID,
mutil.NewSlaveID(slaveId), ref.slaveID,
messages.Kamikaze) messages.Kamikaze,
)
if err != nil { if err != nil {
log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err) msg := fmt.Sprintf(
} else { "error sending kamikaze message to executor %q on slave %q: %v",
io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId)) ref.executorID.GetValue(),
ref.slaveID.GetValue(),
err,
)
log.Warning(msg)
fmt.Fprintln(w, msg)
continue
} }
io.WriteString(w, fmt.Sprintf(
"kamikaze message sent to executor %q on slave %q\n",
ref.executorID.GetValue(),
ref.slaveID.GetValue(),
))
} }
io.WriteString(w, "OK") io.WriteString(w, "OK")
})) }))
} }
@ -702,11 +763,16 @@ func (ks *framework) KillTask(id string) error {
} }
func (ks *framework) LaunchTask(t *podtask.T) error { func (ks *framework) LaunchTask(t *podtask.T) error {
taskInfo, err := t.BuildTaskInfo()
if err != nil {
return err
}
// assume caller is holding scheduler lock // assume caller is holding scheduler lock
taskList := []*mesos.TaskInfo{t.BuildTaskInfo(ks.executor)} taskList := []*mesos.TaskInfo{taskInfo}
offerIds := []*mesos.OfferID{t.Offer.Details().Id} offerIds := []*mesos.OfferID{t.Offer.Details().Id}
filters := &mesos.Filters{} filters := &mesos.Filters{}
_, err := ks.driver.LaunchTasks(offerIds, taskList, filters) _, err = ks.driver.LaunchTasks(offerIds, taskList, filters)
return err return err
} }

View File

@ -37,6 +37,7 @@ import (
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/queuer"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache" "k8s.io/kubernetes/pkg/client/cache"
"k8s.io/kubernetes/pkg/client/record" "k8s.io/kubernetes/pkg/client/record"
@ -54,9 +55,20 @@ type sched struct {
taskRegistry podtask.Registry taskRegistry podtask.Registry
} }
func New(c *config.Config, fw framework.Framework, ps podschedulers.PodScheduler, func New(
client *client.Client, recorder record.EventRecorder, terminate <-chan struct{}, mux *http.ServeMux, lw *cache.ListWatch) scheduler.Scheduler { c *config.Config,
fw framework.Framework,
ps podschedulers.PodScheduler,
client *client.Client,
recorder record.EventRecorder,
terminate <-chan struct{},
mux *http.ServeMux,
lw *cache.ListWatch,
prototype *mesos.ExecutorInfo,
roles []string,
defaultCpus mresource.CPUShares,
defaultMem mresource.MegaBytes,
) scheduler.Scheduler {
core := &sched{ core := &sched{
framework: fw, framework: fw,
taskRegistry: podtask.NewInMemoryRegistry(), taskRegistry: podtask.NewInMemoryRegistry(),
@ -69,7 +81,7 @@ func New(c *config.Config, fw framework.Framework, ps podschedulers.PodScheduler
q := queuer.New(queue.NewDelayFIFO(), podUpdates) q := queuer.New(queue.NewDelayFIFO(), podUpdates)
algorithm := algorithm.New(core, podUpdates, ps) algorithm := algorithm.New(core, podUpdates, ps, prototype, roles, defaultCpus, defaultMem)
podDeleter := deleter.New(core, q) podDeleter := deleter.New(core, q)
@ -86,7 +98,7 @@ func New(c *config.Config, fw framework.Framework, ps podschedulers.PodScheduler
// "backs off" when it can't find an offer that matches up with a pod. // "backs off" when it can't find an offer that matches up with a pod.
// The backoff period for a pod can terminate sooner if an offer becomes // The backoff period for a pod can terminate sooner if an offer becomes
// available that matches up. // available that matches up.
return !task.Has(podtask.Launched) && ps.FitPredicate()(task, offer, nil) return !task.Has(podtask.Launched) && ps.Fit(task, offer, nil)
default: default:
// no point in continuing to check for matching offers // no point in continuing to check for matching offers
return true return true

View File

@ -0,0 +1,92 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package executorinfo
import (
"encoding/base64"
"io"
"bufio"
"github.com/gogo/protobuf/proto"
"github.com/mesos/mesos-go/mesosproto"
)
var base64Codec = base64.StdEncoding
// EncodeResources encodes the given resource slice to the given writer.
// The resource slice is encoded as a comma separated string of
// base64 encoded resource protobufs.
func EncodeResources(w io.Writer, rs []*mesosproto.Resource) error {
sep := ""
for _, r := range rs {
_, err := io.WriteString(w, sep)
if err != nil {
return err
}
buf, err := proto.Marshal(r)
if err != nil {
return err
}
encoded := base64Codec.EncodeToString(buf)
_, err = io.WriteString(w, encoded)
if err != nil {
return err
}
sep = ","
}
return nil
}
// DecodeResources decodes a resource slice from the given reader.
// The format is expected to be the same as in EncodeResources.
func DecodeResources(r io.Reader) (rs []*mesosproto.Resource, err error) {
delimited := bufio.NewReader(r)
rs = []*mesosproto.Resource{}
for err != io.EOF {
var encoded string
encoded, err = delimited.ReadString(',')
switch {
case err == io.EOF:
case err == nil:
encoded = encoded[:len(encoded)-1]
default: // err != nil && err != io.EOF
return nil, err
}
decoded, err := base64Codec.DecodeString(encoded)
if err != nil {
return nil, err
}
r := mesosproto.Resource{}
if err := proto.Unmarshal(decoded, &r); err != nil {
return nil, err
}
rs = append(rs, &r)
}
return rs, nil
}

View File

@ -0,0 +1,69 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package executorinfo
import (
"bytes"
"reflect"
"testing"
"github.com/mesos/mesos-go/mesosproto"
"github.com/mesos/mesos-go/mesosutil"
)
func TestEncodeDecode(t *testing.T) {
want := []*mesosproto.Resource{
scalar("cpus", 0.1, "*"),
scalar("mem", 64.0, "*"),
scalar("mem", 128.0, "public_slave"),
}
var buf bytes.Buffer
if err := EncodeResources(&buf, want); err != nil {
t.Error(err)
}
got, err := DecodeResources(&buf)
if err != nil {
t.Error(err)
}
if ok := reflect.DeepEqual(want, got); !ok {
t.Errorf("want %v got %v", want, got)
}
}
func TestEncodeDecodeNil(t *testing.T) {
var buf bytes.Buffer
if err := EncodeResources(&buf, nil); err != nil {
t.Error(err)
}
if buf.String() != "" {
t.Errorf("expected empty string but got %q", buf.String())
}
if _, err := DecodeResources(&buf); err == nil {
t.Errorf("expected error but got none")
}
}
func scalar(name string, value float64, role string) *mesosproto.Resource {
res := mesosutil.NewScalarResource(name, value)
res.Role = &role
return res
}

View File

@ -14,5 +14,6 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
// Package uid encapsulates unique identifiers code used by the scheduler. // Package executorinfo provides a lru-based executor info registry
package uid // as well as some utility methods.
package executorinfo

View File

@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
package service package executorinfo
import ( import (
"bytes" "bytes"
@ -23,17 +23,32 @@ import (
"sort" "sort"
"strconv" "strconv"
"github.com/gogo/protobuf/proto"
"github.com/mesos/mesos-go/mesosproto"
mesos "github.com/mesos/mesos-go/mesosproto" mesos "github.com/mesos/mesos-go/mesosproto"
execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
) )
func NewID(info *mesosproto.ExecutorInfo) *mesosproto.ExecutorID {
eid := fmt.Sprintf("%x_%s", hash(info), execcfg.DefaultInfoID)
return &mesosproto.ExecutorID{Value: proto.String(eid)}
}
// compute a hashcode for ExecutorInfo that may be used as a reasonable litmus test // compute a hashcode for ExecutorInfo that may be used as a reasonable litmus test
// with respect to compatibility across HA schedulers. the intent is that an HA scheduler // with respect to compatibility across HA schedulers. the intent is that an HA scheduler
// should fail-fast if it doesn't pass this test, rather than generating (potentially many) // should fail-fast if it doesn't pass this test, rather than generating (potentially many)
// errors at run-time because a Mesos master decides that the ExecutorInfo generated by a // errors at run-time because a Mesos master decides that the ExecutorInfo generated by a
// secondary scheduler doesn't match that of the primary scheduler. // secondary scheduler doesn't match that of the primary scheduler.
// //
// Note: We intentionally leave out the Resources in this hash because they are
// set during procurement and should not lead to a different ExecutorId.
// This also means that the Resources do not contribute to offer
// compatibility checking. But as we persist and restore the Resources
// through node annotations we make sure that the right resources are chosen
// during task launch.
//
// see https://github.com/apache/mesos/blob/0.22.0/src/common/type_utils.cpp#L110 // see https://github.com/apache/mesos/blob/0.22.0/src/common/type_utils.cpp#L110
func hashExecutorInfo(info *mesos.ExecutorInfo) uint64 { func hash(info *mesos.ExecutorInfo) uint64 {
// !!! we specifically do NOT include: // !!! we specifically do NOT include:
// - Framework ID because it's a value that's initialized too late for us to use // - Framework ID because it's a value that's initialized too late for us to use
// - Executor ID because it's a value that includes a copy of this hash // - Executor ID because it's a value that includes a copy of this hash
@ -54,7 +69,7 @@ func hashExecutorInfo(info *mesos.ExecutorInfo) uint64 {
buf.WriteString(item) buf.WriteString(item)
} }
} }
if vars := info.Command.Environment.GetVariables(); vars != nil && len(vars) > 0 { if vars := info.Command.Environment.GetVariables(); len(vars) > 0 {
names := []string{} names := []string{}
e := make(map[string]string) e := make(map[string]string)
@ -81,7 +96,7 @@ func hashExecutorInfo(info *mesos.ExecutorInfo) uint64 {
buf.WriteString(uri) buf.WriteString(uri)
} }
} }
//TODO(jdef) add support for Resources and Container //TODO(jdef) add support for Container
} }
table := crc64.MakeTable(crc64.ECMA) table := crc64.MakeTable(crc64.ECMA)
return crc64.Checksum(buf.Bytes(), table) return crc64.Checksum(buf.Bytes(), table)

View File

@ -0,0 +1,95 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package executorinfo
import (
"container/list"
"errors"
"github.com/mesos/mesos-go/mesosproto"
)
// Cache is an LRU cache for executor info objects.
// It is not safe for concurrent use.
type Cache struct {
maxEntries int
ll *list.List
cache map[string]*list.Element // by hostname
}
type entry struct {
hostname string
info *mesosproto.ExecutorInfo
}
// NewCache creates a new cache.
// If maxEntries is not positive, an error is returned.
func NewCache(maxEntries int) (*Cache, error) {
if maxEntries <= 0 {
return nil, errors.New("invalid maxEntries value")
}
return &Cache{
maxEntries: maxEntries,
ll: list.New(), // least recently used sorted linked list
cache: make(map[string]*list.Element),
}, nil
}
// Add adds an executor info associated with the given hostname to the cache.
func (c *Cache) Add(hostname string, e *mesosproto.ExecutorInfo) {
if ee, ok := c.cache[hostname]; ok {
c.ll.MoveToFront(ee)
ee.Value.(*entry).info = e
return
}
el := c.ll.PushFront(&entry{hostname, e})
c.cache[hostname] = el
if c.ll.Len() > c.maxEntries {
c.RemoveOldest()
}
}
// Get looks up a hostname's executor info from the cache.
func (c *Cache) Get(hostname string) (e *mesosproto.ExecutorInfo, ok bool) {
if el, hit := c.cache[hostname]; hit {
c.ll.MoveToFront(el)
return el.Value.(*entry).info, true
}
return
}
// Remove removes the provided hostname from the cache.
func (c *Cache) Remove(hostname string) {
if el, hit := c.cache[hostname]; hit {
c.removeElement(el)
}
}
// RemoveOldest removes the oldest item from the cache.
func (c *Cache) RemoveOldest() {
oldest := c.ll.Back()
if oldest != nil {
c.removeElement(oldest)
}
}
func (c *Cache) removeElement(el *list.Element) {
c.ll.Remove(el)
kv := el.Value.(*entry)
delete(c.cache, kv.hostname)
}

View File

@ -14,34 +14,42 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
package uid package executorinfo
import ( import (
"testing" "testing"
"github.com/mesos/mesos-go/mesosproto"
) )
func TestUID_Parse(t *testing.T) { func TestLruCache(t *testing.T) {
valid := []string{"1234567890abcdef_foo", "123_bar", "face_time"} c, err := NewCache(2)
groups := []uint64{0x1234567890abcdef, 0x123, 0xface} if err != nil {
t.Fatal(err)
for i, good := range valid {
u := Parse(good)
if u == nil {
t.Errorf("expected parsed UID, not nil")
}
if groups[i] != u.Group() {
t.Errorf("expected matching group instead of %x", u.Group())
}
if good != u.String() {
t.Errorf("expected %q instead of %q", good, u.String())
}
} }
invalid := []string{"", "bad"} e := &mesosproto.ExecutorInfo{}
for _, bad := range invalid {
u := Parse(bad) c.Add("foo", e)
if u != nil { c.Add("bar", e)
t.Errorf("expected nil UID instead of %v", u)
if _, ok := c.Get("bar"); !ok {
t.Fatal(`expected "bar" but got none`)
} }
if _, ok := c.Get("foo"); !ok {
t.Fatal(`expected "foo" but got none`)
}
c.Add("foo", e)
c.Add("baz", e)
if _, ok := c.Get("bar"); ok {
t.Fatal(`expected none but got "bar"`)
}
c.Remove("foo")
if _, ok := c.Get("foo"); ok {
t.Fatal(`expected none but got "foo"`)
} }
} }

View File

@ -0,0 +1,178 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package executorinfo
import (
"fmt"
"strings"
"sync"
"github.com/gogo/protobuf/proto"
"github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
)
// Registry is the interface that provides methods for interacting
// with a registry of ExecutorInfo objects
//
// Get looks up an ExecutorInfo object for the given hostname
//
// New returns an ExecutorInfo object based on a given hostname and resources
//
// Invalidate invalidates the given hostname from this registry.
// Note that a subsequent Get may recover the executor info.
type Registry interface {
New(hostname string, resources []*mesosproto.Resource) *mesosproto.ExecutorInfo
Get(hostname string) (*mesosproto.ExecutorInfo, error)
Invalidate(hostname string)
}
// registry implements a map-based in-memory ExecutorInfo registry
type registry struct {
cache *Cache
mu sync.RWMutex // protects fields above
lookupNode node.LookupFunc
prototype *mesosproto.ExecutorInfo
}
// NewRegistry returns a new executorinfo registry.
// The given prototype is used for all properties other than resources.
func NewRegistry(
lookupNode node.LookupFunc,
prototype *mesosproto.ExecutorInfo,
cache *Cache,
) (Registry, error) {
if prototype == nil {
return nil, fmt.Errorf("no prototype given")
}
if lookupNode == nil {
return nil, fmt.Errorf("no lookupNode given")
}
if cache == nil {
return nil, fmt.Errorf("no cache given")
}
return &registry{
cache: cache,
lookupNode: lookupNode,
prototype: prototype,
}, nil
}
// New creates a customized ExecutorInfo for a host
//
// Note: New modifies Command.Arguments and Resources and intentionally
// does not update the executor id (although that originally depended on the
// command arguments and the resources). But since the hostname is constant for a
// given host, and the resources are kept compatible by the registry logic here,
// this does not weaken our litmus test of comparing the prototype ExecutorId with
// the id of running executors when an offer comes in.
func (r *registry) New(
hostname string,
resources []*mesosproto.Resource,
) *mesosproto.ExecutorInfo {
e := proto.Clone(r.prototype).(*mesosproto.ExecutorInfo)
e.Resources = resources
setCommandArgument(e, "--hostname-override", hostname)
r.mu.Lock()
defer r.mu.Unlock()
cached, ok := r.cache.Get(hostname)
if ok {
return cached
}
r.cache.Add(hostname, e)
return e
}
func (r *registry) Get(hostname string) (*mesosproto.ExecutorInfo, error) {
// first try to read from cached items
r.mu.RLock()
info, ok := r.cache.Get(hostname)
r.mu.RUnlock()
if ok {
return info, nil
}
result, err := r.resourcesFromNode(hostname)
if err != nil {
// the master claims there is an executor with this id, but we cannot find any meta info
// => no way to recover this node
return nil, fmt.Errorf(
"failed to recover executor info for node %q, error: %v",
hostname, err,
)
}
return r.New(hostname, result), nil
}
func (r *registry) Invalidate(hostname string) {
r.mu.Lock()
defer r.mu.Unlock()
r.cache.Remove(hostname)
}
// resourcesFromNode looks up the ExecutorInfo resources for the given hostname
// or returns an error in case of failure.
func (r *registry) resourcesFromNode(hostname string) ([]*mesosproto.Resource, error) {
n := r.lookupNode(hostname)
if n == nil {
return nil, fmt.Errorf("hostname %q not found", hostname)
}
encoded, ok := n.Annotations[meta.ExecutorResourcesKey]
if !ok {
return nil, fmt.Errorf(
"no %q annotation found in hostname %q",
meta.ExecutorResourcesKey, hostname,
)
}
return DecodeResources(strings.NewReader(encoded))
}
// setCommandArgument sets the given flag to the given value
// in the command arguments of the given ExecutorInfo.
func setCommandArgument(ei *mesosproto.ExecutorInfo, flag, value string) {
if ei.Command == nil {
return
}
argv := ei.Command.Arguments
overwrite := false
for i, arg := range argv {
if strings.HasPrefix(arg, flag+"=") {
overwrite = true
argv[i] = flag + "=" + value
break
}
}
if !overwrite {
ei.Command.Arguments = append(argv, flag+"="+value)
}
}
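// exampleRegistryUsage is an illustrative sketch only (not part of this change);
// the hostname, cache size and empty prototype are invented. It shows how the
// registry above is wired together and used.
func exampleRegistryUsage(lookupNode node.LookupFunc) error {
	cache, err := NewCache(1024)
	if err != nil {
		return err
	}

	// in practice the prototype carries the executor id, command and default resources
	prototype := &mesosproto.ExecutorInfo{}

	r, err := NewRegistry(lookupNode, prototype, cache)
	if err != nil {
		return err
	}

	// New clones the prototype for the host, substitutes the given resources and
	// caches the result; Get serves from the cache or recovers the resources from
	// the node's executor-resources annotation.
	_ = r.New("slave-a", nil)
	if _, err := r.Get("slave-a"); err != nil {
		return err
	}

	// Invalidate drops the cached entry; a subsequent Get may recover it again.
	r.Invalidate("slave-a")
	return nil
}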

View File

@ -0,0 +1,194 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package executorinfo
import (
"bytes"
"reflect"
"testing"
"github.com/gogo/protobuf/proto"
"github.com/mesos/mesos-go/mesosproto"
"github.com/mesos/mesos-go/mesosutil"
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/pkg/api"
)
func TestRegistryGet(t *testing.T) {
var lookupFunc func() *api.Node
lookupNode := node.LookupFunc(func(hostname string) *api.Node {
return lookupFunc()
})
prototype := &mesosproto.ExecutorInfo{
Resources: []*mesosproto.Resource{
scalar("foo", 1.0, "role1"),
},
}
c, err := NewCache(1000)
if err != nil {
t.Error(err)
return
}
r, err := NewRegistry(lookupNode, prototype, c)
if err != nil {
t.Error(err)
return
}
var resources bytes.Buffer
EncodeResources(&resources, prototype.GetResources())
for i, tt := range []struct {
apiNode *api.Node
wantErr bool
}{
{
apiNode: nil,
wantErr: true,
}, {
apiNode: &api.Node{},
wantErr: true,
}, {
apiNode: &api.Node{
ObjectMeta: api.ObjectMeta{
Annotations: map[string]string{},
},
},
wantErr: true,
}, {
apiNode: &api.Node{
ObjectMeta: api.ObjectMeta{
Annotations: map[string]string{
meta.ExecutorResourcesKey: resources.String(),
},
},
},
wantErr: false,
},
} {
lookupFunc = func() *api.Node { return tt.apiNode }
_, err := r.Get("")
if tt.wantErr && err == nil {
t.Errorf("test %d: want error but got none", i)
}
if !tt.wantErr && err != nil {
t.Errorf("test %d error: %v", i, err)
}
}
}
func TestRegistryNew(t *testing.T) {
for i, tt := range []struct {
prototype *mesosproto.ExecutorInfo
resources []*mesosproto.Resource
want *mesosproto.ExecutorInfo
}{
{
prototype: &mesosproto.ExecutorInfo{
ExecutorId: mesosutil.NewExecutorID("exec-id"),
},
resources: nil,
want: &mesosproto.ExecutorInfo{
ExecutorId: mesosutil.NewExecutorID("exec-id"),
},
}, {
prototype: &mesosproto.ExecutorInfo{
ExecutorId: mesosutil.NewExecutorID("exec-id"),
},
resources: []*mesosproto.Resource{},
want: &mesosproto.ExecutorInfo{
ExecutorId: mesosutil.NewExecutorID("exec-id"),
Resources: []*mesosproto.Resource{},
},
}, {
prototype: &mesosproto.ExecutorInfo{
ExecutorId: mesosutil.NewExecutorID("exec-id"),
Name: proto.String("foo"),
},
resources: []*mesosproto.Resource{
scalar("foo", 1.0, "role1"),
scalar("bar", 2.0, "role2"),
},
want: &mesosproto.ExecutorInfo{
ExecutorId: mesosutil.NewExecutorID("exec-id"),
Name: proto.String("foo"),
Resources: []*mesosproto.Resource{
scalar("foo", 1.0, "role1"),
scalar("bar", 2.0, "role2"),
},
},
},
} {
lookupNode := node.LookupFunc(func(string) *api.Node {
return nil
})
c, err := NewCache(1000)
if err != nil {
t.Error(err)
continue
}
r, err := NewRegistry(lookupNode, tt.prototype, c)
if err != nil {
t.Error(err)
continue
}
got := r.New("", tt.resources)
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("test #%d\ngot %v\nwant %v", i, got, tt.want)
}
}
}
func TestRegistryNewDup(t *testing.T) {
lookupNode := node.LookupFunc(func(string) *api.Node {
return nil
})
c, err := NewCache(1000)
if err != nil {
t.Error(err)
return
}
r, err := NewRegistry(lookupNode, &mesosproto.ExecutorInfo{}, c)
if err != nil {
t.Error(err)
return
}
new := r.New("", nil)
dup := r.New("", nil)
if !reflect.DeepEqual(new, dup) {
t.Errorf(
"expected new == dup, but got new %v dup %v",
new, dup,
)
}
}

View File

@ -25,6 +25,7 @@ import (
"testing" "testing"
"time" "time"
"github.com/gogo/protobuf/proto"
log "github.com/golang/glog" log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto" mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/mesos/mesos-go/mesosutil" "github.com/mesos/mesos-go/mesosutil"
@ -435,6 +436,24 @@ type lifecycleTest struct {
t *testing.T t *testing.T
} }
type mockRegistry struct {
prototype *mesos.ExecutorInfo
}
func (m mockRegistry) New(nodename string, rs []*mesos.Resource) *mesos.ExecutorInfo {
clone := proto.Clone(m.prototype).(*mesos.ExecutorInfo)
clone.Resources = rs
return clone
}
func (m mockRegistry) Get(nodename string) (*mesos.ExecutorInfo, error) {
panic("N/A")
}
func (m mockRegistry) Invalidate(hostname string) {
panic("N/A")
}
func newLifecycleTest(t *testing.T) lifecycleTest { func newLifecycleTest(t *testing.T) lifecycleTest {
assert := &EventAssertions{*assert.New(t)} assert := &EventAssertions{*assert.New(t)}
@ -458,7 +477,7 @@ func newLifecycleTest(t *testing.T) lifecycleTest {
}) })
c := *schedcfg.CreateDefaultConfig() c := *schedcfg.CreateDefaultConfig()
fw := framework.New(framework.Config{ fw := framework.New(framework.Config{
Executor: ei, ExecutorId: ei.GetExecutorId(),
Client: client, Client: client,
SchedulerConfig: c, SchedulerConfig: c,
LookupNode: apiServer.LookupNode, LookupNode: apiServer.LookupNode,
@ -470,24 +489,28 @@ func newLifecycleTest(t *testing.T) lifecycleTest {
// assert.NotNil(framework.offers, "offer registry is nil") // assert.NotNil(framework.offers, "offer registry is nil")
// create pod scheduler // create pod scheduler
strategy := podschedulers.NewAllocationStrategy( pr := podtask.NewDefaultProcurement(ei, mockRegistry{ei})
podtask.NewDefaultPredicate( fcfs := podschedulers.NewFCFSPodScheduler(pr, apiServer.LookupNode)
mresource.DefaultDefaultContainerCPULimit,
mresource.DefaultDefaultContainerMemLimit,
),
podtask.NewDefaultProcurement(
mresource.DefaultDefaultContainerCPULimit,
mresource.DefaultDefaultContainerMemLimit,
),
)
fcfs := podschedulers.NewFCFSPodScheduler(strategy, apiServer.LookupNode)
// create scheduler process // create scheduler process
schedulerProc := ha.New(fw) schedulerProc := ha.New(fw)
// create scheduler // create scheduler
eventObs := NewEventObserver() eventObs := NewEventObserver()
scheduler := components.New(&c, fw, fcfs, client, eventObs, schedulerProc.Terminal(), http.DefaultServeMux, &podsListWatch.ListWatch) scheduler := components.New(
&c,
fw,
fcfs,
client,
eventObs,
schedulerProc.Terminal(),
http.DefaultServeMux,
&podsListWatch.ListWatch,
ei,
[]string{"*"},
mresource.DefaultDefaultContainerCPULimit,
mresource.DefaultDefaultContainerMemLimit,
)
assert.NotNil(scheduler) assert.NotNil(scheduler)
// create mock mesos scheduler driver // create mock mesos scheduler driver

View File

@ -25,6 +25,9 @@ const (
	TaskIdKey                = "k8s.mesosphere.io/taskId"
	SlaveIdKey               = "k8s.mesosphere.io/slaveId"
	OfferIdKey               = "k8s.mesosphere.io/offerId"
	ExecutorIdKey            = "k8s.mesosphere.io/executorId"
	ExecutorResourcesKey     = "k8s.mesosphere.io/executorResources"
	PortMappingKey           = "k8s.mesosphere.io/portMapping"
	PortMappingKeyPrefix     = "k8s.mesosphere.io/port_"
	PortMappingKeyFormat     = PortMappingKeyPrefix + "%s_%d"
	PortNameMappingKeyPrefix = "k8s.mesosphere.io/portName_"

View File

@ -0,0 +1,22 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package meta
// kubernetes api object labels
const (
RolesKey = "k8s.mesosphere.io/roles"
)
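// Example (illustrative, not part of this change): a pod opts into specific Mesos
// roles by carrying a comma-separated list under this label, e.g.
//
//	metadata:
//	  labels:
//	    k8s.mesosphere.io/roles: "*,role1"
//
// The scheduler intersects the requested roles with the roles it is allowed to
// use before procuring offered resources for the pod.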

View File

@ -1,74 +0,0 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/pkg/api"
)
// bogus numbers that we use to make sure that there's some set of minimal offered resources on the slave
const (
minimalCpus = 0.01
minimalMem = 0.25
)
var (
DefaultMinimalPredicate = RequireAllPredicate([]FitPredicate{
ValidationPredicate,
NodeSelectorPredicate,
MinimalPodResourcesPredicate,
PortsPredicate,
}).Fit
DefaultMinimalProcurement = AllOrNothingProcurement([]Procurement{
ValidateProcurement,
NodeProcurement,
MinimalPodResourcesProcurement,
PortsProcurement,
}).Procure
)
func MinimalPodResourcesPredicate(t *T, offer *mesos.Offer, _ *api.Node) bool {
var (
offeredCpus float64
offeredMem float64
)
for _, resource := range offer.Resources {
if resource.GetName() == "cpus" {
offeredCpus = resource.GetScalar().GetValue()
}
if resource.GetName() == "mem" {
offeredMem = resource.GetScalar().GetValue()
}
}
log.V(4).Infof("trying to match offer with pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, minimalCpus, minimalMem)
if (minimalCpus > offeredCpus) || (minimalMem > offeredMem) {
log.V(3).Infof("not enough resources for pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, minimalCpus, minimalMem)
return false
}
return true
}
func MinimalPodResourcesProcurement(t *T, details *mesos.Offer) error {
log.V(3).Infof("Recording offer(s) %s/%s against pod %v: cpu: %.2f, mem: %.2f MB", details.Id, t.Pod.Namespace, t.Pod.Name, minimalCpus, minimalMem)
t.Spec.CPU = minimalCpus
t.Spec.Memory = minimalMem
return nil
}

View File

@ -17,6 +17,7 @@ limitations under the License.
package podtask package podtask
import (
	"errors"
	"fmt"
	"strings"
	"time"
@ -26,7 +27,6 @@ import (
"k8s.io/kubernetes/contrib/mesos/pkg/offers" "k8s.io/kubernetes/contrib/mesos/pkg/offers"
annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta" annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
log "github.com/golang/glog" log "github.com/golang/glog"
@ -51,11 +51,17 @@ const (
Deleted = FlagType("deleted") Deleted = FlagType("deleted")
) )
var defaultRoles = []string{"*"}
// A struct that describes a pod task. // A struct that describes a pod task.
type T struct { type T struct {
ID string ID string
Pod api.Pod Pod api.Pod
Spec Spec
	// Stores the final procurement result; once set it is read-only.
	// Meant to be set by algorithm.SchedulerAlgorithm only.
Spec *Spec
Offer offers.Perishable // thread-safe Offer offers.Perishable // thread-safe
State StateType State StateType
Flags map[FlagType]struct{} Flags map[FlagType]struct{}
@ -63,20 +69,26 @@ type T struct {
UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master UpdatedTime time.Time // time of the most recent StatusUpdate we've seen from the mesos master
podStatus api.PodStatus podStatus api.PodStatus
prototype *mesos.ExecutorInfo // readonly
allowedRoles []string // roles under which pods are allowed to be launched
podKey string podKey string
launchTime time.Time launchTime time.Time
bindTime time.Time bindTime time.Time
mapper HostPortMappingType mapper HostPortMapper
}
type Port struct {
Port uint64
Role string
} }
type Spec struct { type Spec struct {
SlaveID string SlaveID string
AssignedSlave string AssignedSlave string
CPU mresource.CPUShares Resources []*mesos.Resource
Memory mresource.MegaBytes
PortMap []HostPortMapping PortMap []HostPortMapping
Ports []uint64
Data []byte Data []byte
Executor *mesos.ExecutorInfo
} }
// mostly-clone this pod task. the clone will actually share the some fields: // mostly-clone this pod task. the clone will actually share the some fields:
@ -91,7 +103,6 @@ func (t *T) Clone() *T {
clone := *t clone := *t
// deep copy // deep copy
(&t.Spec).copyTo(&clone.Spec)
clone.Flags = map[FlagType]struct{}{} clone.Flags = map[FlagType]struct{}{}
for k := range t.Flags { for k := range t.Flags {
clone.Flags[k] = struct{}{} clone.Flags[k] = struct{}{}
@ -99,20 +110,8 @@ func (t *T) Clone() *T {
return &clone return &clone
} }
func (old *Spec) copyTo(new *Spec) {
if len(old.PortMap) > 0 {
new.PortMap = append(([]HostPortMapping)(nil), old.PortMap...)
}
if len(old.Ports) > 0 {
new.Ports = append(([]uint64)(nil), old.Ports...)
}
if len(old.Data) > 0 {
new.Data = append(([]byte)(nil), old.Data...)
}
}
func (t *T) HasAcceptedOffer() bool { func (t *T) HasAcceptedOffer() bool {
return t.Spec.SlaveID != "" return t.Spec != nil
} }
func (t *T) GetOfferId() string { func (t *T) GetOfferId() string {
@ -130,50 +129,21 @@ func generateTaskName(pod *api.Pod) string {
return fmt.Sprintf("%s.%s.pods", pod.Name, ns) return fmt.Sprintf("%s.%s.pods", pod.Name, ns)
} }
func setCommandArgument(ei *mesos.ExecutorInfo, flag, value string, create bool) { func (t *T) BuildTaskInfo() (*mesos.TaskInfo, error) {
argv := []string{} if t.Spec == nil {
overwrite := false return nil, errors.New("no podtask.T.Spec given, cannot build task info")
if ei.Command != nil && ei.Command.Arguments != nil {
argv = ei.Command.Arguments
for i, arg := range argv {
if strings.HasPrefix(arg, flag+"=") {
overwrite = true
argv[i] = flag + "=" + value
break
} }
}
}
if !overwrite && create {
argv = append(argv, flag+"="+value)
if ei.Command == nil {
ei.Command = &mesos.CommandInfo{}
}
ei.Command.Arguments = argv
}
}
func (t *T) BuildTaskInfo(prototype *mesos.ExecutorInfo) *mesos.TaskInfo {
info := &mesos.TaskInfo{ info := &mesos.TaskInfo{
Name: proto.String(generateTaskName(&t.Pod)), Name: proto.String(generateTaskName(&t.Pod)),
TaskId: mutil.NewTaskID(t.ID), TaskId: mutil.NewTaskID(t.ID),
SlaveId: mutil.NewSlaveID(t.Spec.SlaveID), Executor: t.Spec.Executor,
Executor: proto.Clone(prototype).(*mesos.ExecutorInfo),
Data: t.Spec.Data, Data: t.Spec.Data,
Resources: []*mesos.Resource{ Resources: t.Spec.Resources,
mutil.NewScalarResource("cpus", float64(t.Spec.CPU)), SlaveId: mutil.NewSlaveID(t.Spec.SlaveID),
mutil.NewScalarResource("mem", float64(t.Spec.Memory)),
},
} }
if portsResource := rangeResource("ports", t.Spec.Ports); portsResource != nil { return info, nil
info.Resources = append(info.Resources, portsResource)
}
// hostname needs of the executor needs to match that of the offer, otherwise
// the kubelet node status checker/updater is very unhappy
setCommandArgument(info.Executor, "--hostname-override", t.Spec.AssignedSlave, true)
return info
} }
// Clear offer-related details from the task, should be called if/when an offer // Clear offer-related details from the task, should be called if/when an offer
@ -181,7 +151,7 @@ func (t *T) BuildTaskInfo(prototype *mesos.ExecutorInfo) *mesos.TaskInfo {
func (t *T) Reset() { func (t *T) Reset() {
log.V(3).Infof("Clearing offer(s) from pod %v", t.Pod.Name) log.V(3).Infof("Clearing offer(s) from pod %v", t.Pod.Name)
t.Offer = nil t.Offer = nil
t.Spec = Spec{} t.Spec = nil
} }
func (t *T) Set(f FlagType) { func (t *T) Set(f FlagType) {
@ -198,23 +168,57 @@ func (t *T) Has(f FlagType) (exists bool) {
return return
} }
func New(ctx api.Context, id string, pod *api.Pod) (*T, error) { func (t *T) Roles() []string {
var roles []string
if r, ok := t.Pod.ObjectMeta.Labels[annotation.RolesKey]; ok {
roles = strings.Split(r, ",")
for i, r := range roles {
roles[i] = strings.TrimSpace(r)
}
roles = filterRoles(roles, not(emptyRole), not(seenRole()))
} else {
// no roles label defined,
// by convention return the first allowed role
// to be used for launching the pod task
return []string{t.allowedRoles[0]}
}
return filterRoles(roles, inRoles(t.allowedRoles...))
}
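// Illustrative note (not part of this change): with allowedRoles = ["*", "role1"],
// a pod labeled `k8s.mesosphere.io/roles: "role2, role1"` yields Roles() == ["role1"],
// while a pod without the label falls back to the first allowed role, ["*"].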
func New(ctx api.Context, id string, pod *api.Pod, prototype *mesos.ExecutorInfo, allowedRoles []string) (*T, error) {
if prototype == nil {
return nil, fmt.Errorf("illegal argument: executor is nil")
}
if len(allowedRoles) == 0 {
allowedRoles = defaultRoles
}
key, err := MakePodKey(ctx, pod.Name) key, err := MakePodKey(ctx, pod.Name)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if id == "" { if id == "" {
id = "pod." + uuid.NewUUID().String() id = "pod." + uuid.NewUUID().String()
} }
task := &T{ task := &T{
ID: id, ID: id,
Pod: *pod, Pod: *pod,
State: StatePending, State: StatePending,
podKey: key, podKey: key,
mapper: MappingTypeForPod(pod), mapper: NewHostPortMapper(pod),
Flags: make(map[FlagType]struct{}), Flags: make(map[FlagType]struct{}),
prototype: prototype,
allowedRoles: allowedRoles,
} }
task.CreateTime = time.Now() task.CreateTime = time.Now()
return task, nil return task, nil
} }
@ -222,6 +226,7 @@ func (t *T) SaveRecoveryInfo(dict map[string]string) {
dict[annotation.TaskIdKey] = t.ID dict[annotation.TaskIdKey] = t.ID
dict[annotation.SlaveIdKey] = t.Spec.SlaveID dict[annotation.SlaveIdKey] = t.Spec.SlaveID
dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue() dict[annotation.OfferIdKey] = t.Offer.Details().Id.GetValue()
dict[annotation.ExecutorIdKey] = t.Spec.Executor.ExecutorId.GetValue()
} }
// reconstruct a task from metadata stashed in a pod entry. there are limited pod states that // reconstruct a task from metadata stashed in a pod entry. there are limited pod states that
@ -267,9 +272,10 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
podKey: key, podKey: key,
State: StatePending, // possibly running? mesos will tell us during reconciliation State: StatePending, // possibly running? mesos will tell us during reconciliation
Flags: make(map[FlagType]struct{}), Flags: make(map[FlagType]struct{}),
mapper: MappingTypeForPod(&pod), mapper: NewHostPortMapper(&pod),
launchTime: now, launchTime: now,
bindTime: now, bindTime: now,
Spec: &Spec{},
} }
var ( var (
offerId string offerId string
@ -293,6 +299,10 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
offerId = v offerId = v
case annotation.TaskIdKey: case annotation.TaskIdKey:
t.ID = v t.ID = v
case annotation.ExecutorIdKey:
// this is nowhere near sufficient to re-launch a task, but we really just
// want this for tracking
t.Spec.Executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
} }
} }
t.Offer = offers.Expired(offerId, t.Spec.AssignedSlave, 0) t.Offer = offers.Expired(offerId, t.Spec.AssignedSlave, 0)

View File

@ -17,13 +17,15 @@ limitations under the License.
package podtask package podtask
import ( import (
"reflect"
"testing" "testing"
"github.com/gogo/protobuf/proto" "github.com/gogo/protobuf/proto"
mesos "github.com/mesos/mesos-go/mesosproto" mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil" mutil "github.com/mesos/mesos-go/mesosutil"
"github.com/stretchr/testify/assert"
"k8s.io/kubernetes/contrib/mesos/pkg/node" "k8s.io/kubernetes/contrib/mesos/pkg/node"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource" "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
) )
@ -32,21 +34,100 @@ const (
t_min_mem = 128 t_min_mem = 128
) )
func fakePodTask(id string) (*T, error) { func fakePodTask(id string, roles ...string) *T {
return New(api.NewDefaultContext(), "", &api.Pod{ t, _ := New(
api.NewDefaultContext(),
"",
&api.Pod{
ObjectMeta: api.ObjectMeta{ ObjectMeta: api.ObjectMeta{
Name: id, Name: id,
Namespace: api.NamespaceDefault, Namespace: api.NamespaceDefault,
}, },
}) },
&mesos.ExecutorInfo{},
roles,
)
return t
}
func TestRoles(t *testing.T) {
assert := assert.New(t)
for i, tt := range []struct {
labels map[string]string
frameworkRoles []string
want []string
}{
{
map[string]string{},
nil,
defaultRoles,
},
{
map[string]string{"other": "label"},
nil,
defaultRoles,
},
{
map[string]string{meta.RolesKey: ""},
nil,
[]string{},
},
{
map[string]string{
"other": "label",
meta.RolesKey: ", , ,",
},
nil,
[]string{},
},
{
map[string]string{meta.RolesKey: "forbiddenRole"},
[]string{"allowedRole"},
[]string{},
},
{
map[string]string{meta.RolesKey: "*, , *, ,slave_public,"},
[]string{"*", "slave_public"},
[]string{"*", "slave_public"},
},
{
map[string]string{meta.RolesKey: "role3,role2,role1"},
[]string{"role1", "role4"},
[]string{"role1"},
},
{
map[string]string{},
[]string{"role1"},
[]string{"role1"},
},
} {
task := fakePodTask("test", tt.frameworkRoles...)
task.Pod.ObjectMeta.Labels = tt.labels
assert.True(reflect.DeepEqual(task.Roles(), tt.want), "test #%d got %#v want %#v", i, task.Roles(), tt.want)
}
}
type mockRegistry struct{}
func (mr mockRegistry) New(nodename string, resources []*mesos.Resource) *mesos.ExecutorInfo {
return &mesos.ExecutorInfo{
Resources: resources,
}
}
func (mr mockRegistry) Get(nodename string) (*mesos.ExecutorInfo, error) {
panic("N/A")
}
func (mr mockRegistry) Invalidate(hostname string) {
panic("N/A")
} }
func TestEmptyOffer(t *testing.T) { func TestEmptyOffer(t *testing.T) {
t.Parallel() t.Parallel()
task, err := fakePodTask("foo") task := fakePodTask("foo")
if err != nil {
t.Fatal(err)
}
task.Pod.Spec = api.PodSpec{ task.Pod.Spec = api.PodSpec{
Containers: []api.Container{{ Containers: []api.Container{{
@ -54,21 +135,28 @@ func TestEmptyOffer(t *testing.T) {
}}, }},
} }
defaultPredicate := NewDefaultPredicate(mresource.DefaultDefaultContainerCPULimit, mresource.DefaultDefaultContainerMemLimit) defaultProc := NewDefaultProcurement(
if ok := defaultPredicate(task, nil, nil); ok { &mesos.ExecutorInfo{
t.Fatalf("accepted nil offer") Resources: []*mesos.Resource{
} mutil.NewScalarResource("cpus", 1.0),
if ok := defaultPredicate(task, &mesos.Offer{}, nil); ok { mutil.NewScalarResource("mem", 64.0),
},
},
mockRegistry{},
)
if err := defaultProc.Procure(
task,
&api.Node{},
NewProcureState(&mesos.Offer{}),
); err == nil {
t.Fatalf("accepted empty offer") t.Fatalf("accepted empty offer")
} }
} }
func TestNoPortsInPodOrOffer(t *testing.T) { func TestNoPortsInPodOrOffer(t *testing.T) {
t.Parallel() t.Parallel()
task, err := fakePodTask("foo") task := fakePodTask("foo")
if err != nil || task == nil {
t.Fatal(err)
}
task.Pod.Spec = api.PodSpec{ task.Pod.Spec = api.PodSpec{
Containers: []api.Container{{ Containers: []api.Container{{
@ -76,7 +164,14 @@ func TestNoPortsInPodOrOffer(t *testing.T) {
}}, }},
} }
defaultPredicate := NewDefaultPredicate(mresource.DefaultDefaultContainerCPULimit, mresource.DefaultDefaultContainerMemLimit) executor := &mesos.ExecutorInfo{
Resources: []*mesos.Resource{
mutil.NewScalarResource("cpus", 1.0),
mutil.NewScalarResource("mem", 64.0),
},
}
defaultProc := NewDefaultProcurement(executor, mockRegistry{})
offer := &mesos.Offer{ offer := &mesos.Offer{
Resources: []*mesos.Resource{ Resources: []*mesos.Resource{
@ -84,7 +179,12 @@ func TestNoPortsInPodOrOffer(t *testing.T) {
mutil.NewScalarResource("mem", 0.001), mutil.NewScalarResource("mem", 0.001),
}, },
} }
if ok := defaultPredicate(task, offer, nil); ok {
if err := defaultProc.Procure(
task,
nil,
NewProcureState(offer),
); err == nil {
t.Fatalf("accepted offer %v:", offer) t.Fatalf("accepted offer %v:", offer)
} }
@ -94,26 +194,39 @@ func TestNoPortsInPodOrOffer(t *testing.T) {
mutil.NewScalarResource("mem", t_min_mem), mutil.NewScalarResource("mem", t_min_mem),
}, },
} }
if ok := defaultPredicate(task, offer, nil); !ok {
if err := defaultProc.Procure(
task,
nil,
NewProcureState(offer),
); err != nil {
t.Fatalf("did not accepted offer %v:", offer) t.Fatalf("did not accepted offer %v:", offer)
} }
} }
func TestAcceptOfferPorts(t *testing.T) { func TestAcceptOfferPorts(t *testing.T) {
t.Parallel() t.Parallel()
task, _ := fakePodTask("foo") task := fakePodTask("foo")
pod := &task.Pod pod := &task.Pod
defaultPredicate := NewDefaultPredicate(mresource.DefaultDefaultContainerCPULimit, mresource.DefaultDefaultContainerMemLimit) defaultProc := NewDefaultProcurement(
&mesos.ExecutorInfo{},
mockRegistry{},
)
offer := &mesos.Offer{ offer := &mesos.Offer{
Resources: []*mesos.Resource{ Resources: []*mesos.Resource{
mutil.NewScalarResource("cpus", t_min_cpu), mutil.NewScalarResource("cpus", t_min_cpu),
mutil.NewScalarResource("mem", t_min_mem), mutil.NewScalarResource("mem", t_min_mem),
rangeResource("ports", []uint64{1, 1}), newPortsResource("*", 1, 1),
}, },
} }
if ok := defaultPredicate(task, offer, nil); !ok {
if err := defaultProc.Procure(
task,
&api.Node{},
NewProcureState(offer),
); err != nil {
t.Fatalf("did not accepted offer %v:", offer) t.Fatalf("did not accepted offer %v:", offer)
} }
@ -125,17 +238,31 @@ func TestAcceptOfferPorts(t *testing.T) {
}}, }},
} }
if ok := defaultPredicate(task, offer, nil); ok { if err := defaultProc.Procure(
task,
&api.Node{},
NewProcureState(offer),
); err == nil {
t.Fatalf("accepted offer %v:", offer) t.Fatalf("accepted offer %v:", offer)
} }
pod.Spec.Containers[0].Ports[0].HostPort = 1 pod.Spec.Containers[0].Ports[0].HostPort = 1
if ok := defaultPredicate(task, offer, nil); !ok {
if err := defaultProc.Procure(
task,
&api.Node{},
NewProcureState(offer),
); err != nil {
t.Fatalf("did not accepted offer %v:", offer) t.Fatalf("did not accepted offer %v:", offer)
} }
pod.Spec.Containers[0].Ports[0].HostPort = 0 pod.Spec.Containers[0].Ports[0].HostPort = 0
if ok := defaultPredicate(task, offer, nil); !ok {
if err := defaultProc.Procure(
task,
&api.Node{},
NewProcureState(offer),
); err != nil {
t.Fatalf("did not accepted offer %v:", offer) t.Fatalf("did not accepted offer %v:", offer)
} }
@ -143,12 +270,22 @@ func TestAcceptOfferPorts(t *testing.T) {
mutil.NewScalarResource("cpus", t_min_cpu), mutil.NewScalarResource("cpus", t_min_cpu),
mutil.NewScalarResource("mem", t_min_mem), mutil.NewScalarResource("mem", t_min_mem),
} }
if ok := defaultPredicate(task, offer, nil); ok {
if err := defaultProc.Procure(
task,
&api.Node{},
NewProcureState(offer),
); err == nil {
t.Fatalf("accepted offer %v:", offer) t.Fatalf("accepted offer %v:", offer)
} }
pod.Spec.Containers[0].Ports[0].HostPort = 1 pod.Spec.Containers[0].Ports[0].HostPort = 1
if ok := defaultPredicate(task, offer, nil); ok {
if err := defaultProc.Procure(
task,
&api.Node{},
NewProcureState(offer),
); err == nil {
t.Fatalf("accepted offer %v:", offer) t.Fatalf("accepted offer %v:", offer)
} }
} }
@ -233,10 +370,13 @@ func TestNodeSelector(t *testing.T) {
{map[string]string{"some.other/label": "43"}, node3, true, "non-slave attribute matches"}, {map[string]string{"some.other/label": "43"}, node3, true, "non-slave attribute matches"},
} }
defaultPredicate := NewDefaultPredicate(mresource.DefaultDefaultContainerCPULimit, mresource.DefaultDefaultContainerMemLimit) defaultProc := NewDefaultProcurement(
&mesos.ExecutorInfo{},
mockRegistry{},
)
for _, ts := range tests { for _, ts := range tests {
task, _ := fakePodTask("foo") task := fakePodTask("foo")
task.Pod.Spec.NodeSelector = ts.selector task.Pod.Spec.NodeSelector = ts.selector
offer := &mesos.Offer{ offer := &mesos.Offer{
Resources: []*mesos.Resource{ Resources: []*mesos.Resource{
@ -245,8 +385,16 @@ func TestNodeSelector(t *testing.T) {
}, },
Hostname: &ts.node.Name, Hostname: &ts.node.Name,
} }
if got, want := defaultPredicate(task, offer, ts.node), ts.ok; got != want {
t.Fatalf("expected acceptance of offer for selector %v to be %v, got %v: %q", ts.selector, want, got, ts.desc) err := defaultProc.Procure(
task,
ts.node,
NewProcureState(offer),
)
ok := err == nil
if ts.ok != ok {
t.Fatalf("expected acceptance of offer for selector %v to be %v, got %v: %q", ts.selector, ts.ok, ok, ts.desc)
} }
} }
} }
@ -266,3 +414,12 @@ func newScalarAttribute(name string, val float64) *mesos.Attribute {
Scalar: &mesos.Value_Scalar{Value: proto.Float64(val)}, Scalar: &mesos.Value_Scalar{Value: proto.Float64(val)},
} }
} }
func newPortsResource(role string, ports ...uint64) *mesos.Resource {
return &mesos.Resource{
Name: proto.String("ports"),
Type: mesos.Value_RANGES.Enum(),
Ranges: newRanges(ports),
Role: stringPtrTo(role),
}
}

View File

@ -21,39 +21,43 @@ import (
log "github.com/golang/glog" log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto" mesos "github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/labels" "k8s.io/kubernetes/pkg/labels"
) )
type HostPortMappingType string
const ( const (
// maps a Container.HostPort to the same exact offered host port, ignores .HostPort = 0 // maps a Container.HostPort to the same exact offered host port, ignores .HostPort = 0
HostPortMappingFixed HostPortMappingType = "fixed" HostPortMappingFixed = "fixed"
// same as HostPortMappingFixed, except that .HostPort of 0 are mapped to any port offered // same as HostPortMappingFixed, except that .HostPort of 0 are mapped to any port offered
HostPortMappingWildcard = "wildcard" HostPortMappingWildcard = "wildcard"
) )
// Objects implementing the HostPortMapper interface generate port mappings
// from k8s container ports to ports offered by mesos
type HostPortMapper interface { type HostPortMapper interface {
// abstracts the way that host ports are mapped to pod container ports // Map maps the given pod task and the given mesos offer
Generate(t *T, offer *mesos.Offer) ([]HostPortMapping, error) // and returns a slice of port mappings
// or an error if the mapping failed
Map(t *T, offer *mesos.Offer) ([]HostPortMapping, error)
} }
// HostPortMapperFunc is a function adapter to the HostPortMapper interface
type HostPortMapperFunc func(*T, *mesos.Offer) ([]HostPortMapping, error)
// Map calls f(t, offer)
func (f HostPortMapperFunc) Map(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
return f(t, offer)
}
// A HostPortMapping represents the mapping between k8s container ports
// and ports offered by mesos. It references the k8s container and port
// and specifies the offered mesos port and the offered port's role
type HostPortMapping struct { type HostPortMapping struct {
ContainerIdx int // index of the container in the pod spec ContainerIdx int // index of the container in the pod spec
PortIdx int // index of the port in a container's port spec PortIdx int // index of the port in a container's port spec
OfferPort uint64 OfferPort uint64 // the port offered by mesos
} Role string // the role associated with the offered port
func (self HostPortMappingType) Generate(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
switch self {
case HostPortMappingWildcard:
return wildcardHostPortMapping(t, offer)
case HostPortMappingFixed:
default:
log.Warningf("illegal host-port mapping spec %q, defaulting to %q", self, HostPortMappingFixed)
}
return defaultHostPortMapping(t, offer)
} }
type PortAllocationError struct { type PortAllocationError struct {
@ -75,16 +79,18 @@ func (err *DuplicateHostPortError) Error() string {
err.m1.OfferPort, err.m1.ContainerIdx, err.m1.PortIdx, err.m2.ContainerIdx, err.m2.PortIdx) err.m1.OfferPort, err.m1.ContainerIdx, err.m1.PortIdx, err.m2.ContainerIdx, err.m2.PortIdx)
} }
// wildcard k8s host port mapping implementation: hostPort == 0 gets mapped to any available offer port // WildcardMapper maps k8s wildcard ports (hostPort == 0) to any available offer port
func wildcardHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error) { func WildcardMapper(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
mapping, err := defaultHostPortMapping(t, offer) mapping, err := FixedMapper(t, offer)
if err != nil { if err != nil {
return nil, err return nil, err
} }
taken := make(map[uint64]struct{}) taken := make(map[uint64]struct{})
for _, entry := range mapping { for _, entry := range mapping {
taken[entry.OfferPort] = struct{}{} taken[entry.OfferPort] = struct{}{}
} }
wildports := []HostPortMapping{} wildports := []HostPortMapping{}
for i, container := range t.Pod.Spec.Containers { for i, container := range t.Pod.Spec.Containers {
for pi, port := range container.Ports { for pi, port := range container.Ports {
@ -96,8 +102,9 @@ func wildcardHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error
} }
} }
} }
remaining := len(wildports) remaining := len(wildports)
foreachRange(offer, "ports", func(bp, ep uint64) { foreachPortsRange(offer.GetResources(), t.Roles(), func(bp, ep uint64, role string) {
log.V(3).Infof("Searching for wildcard port in range {%d:%d}", bp, ep) log.V(3).Infof("Searching for wildcard port in range {%d:%d}", bp, ep)
for i := range wildports { for i := range wildports {
if wildports[i].OfferPort != 0 { if wildports[i].OfferPort != 0 {
@ -108,6 +115,7 @@ func wildcardHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error
continue continue
} }
wildports[i].OfferPort = port wildports[i].OfferPort = port
wildports[i].Role = starredRole(role)
mapping = append(mapping, wildports[i]) mapping = append(mapping, wildports[i])
remaining-- remaining--
taken[port] = struct{}{} taken[port] = struct{}{}
@ -115,6 +123,7 @@ func wildcardHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error
} }
} }
}) })
if remaining > 0 { if remaining > 0 {
err := &PortAllocationError{ err := &PortAllocationError{
PodId: t.Pod.Name, PodId: t.Pod.Name,
@ -122,12 +131,12 @@ func wildcardHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error
// it doesn't make sense to include a port list here because they were all zero (wildcards) // it doesn't make sense to include a port list here because they were all zero (wildcards)
return nil, err return nil, err
} }
return mapping, nil return mapping, nil
} }
// default k8s host port mapping implementation: hostPort == 0 means containerPort remains pod-private, and so // FixedMapper maps k8s host ports to offered ports ignoring hostPorts == 0 (remaining pod-private)
// no offer ports will be mapped to such Container ports. func FixedMapper(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
func defaultHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
requiredPorts := make(map[uint64]HostPortMapping) requiredPorts := make(map[uint64]HostPortMapping)
mapping := []HostPortMapping{} mapping := []HostPortMapping{}
for i, container := range t.Pod.Spec.Containers { for i, container := range t.Pod.Spec.Containers {
@ -149,15 +158,19 @@ func defaultHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error)
requiredPorts[uint64(port.HostPort)] = m requiredPorts[uint64(port.HostPort)] = m
} }
} }
foreachRange(offer, "ports", func(bp, ep uint64) {
foreachPortsRange(offer.GetResources(), t.Roles(), func(bp, ep uint64, role string) {
for port := range requiredPorts { for port := range requiredPorts {
log.V(3).Infof("evaluating port range {%d:%d} %d", bp, ep, port) log.V(3).Infof("evaluating port range {%d:%d} %d", bp, ep, port)
if (bp <= port) && (port <= ep) { if (bp <= port) && (port <= ep) {
mapping = append(mapping, requiredPorts[port]) m := requiredPorts[port]
m.Role = starredRole(role)
mapping = append(mapping, m)
delete(requiredPorts, port) delete(requiredPorts, port)
} }
} }
}) })
unsatisfiedPorts := len(requiredPorts) unsatisfiedPorts := len(requiredPorts)
if unsatisfiedPorts > 0 { if unsatisfiedPorts > 0 {
err := &PortAllocationError{ err := &PortAllocationError{
@ -168,18 +181,19 @@ func defaultHostPortMapping(t *T, offer *mesos.Offer) ([]HostPortMapping, error)
} }
return nil, err return nil, err
} }
return mapping, nil return mapping, nil
} }
const PortMappingLabelKey = "k8s.mesosphere.io/portMapping" // NewHostPortMapper returns a new mapper based
// on the pod's port mapping label value
func MappingTypeForPod(pod *api.Pod) HostPortMappingType { func NewHostPortMapper(pod *api.Pod) HostPortMapper {
filter := map[string]string{ filter := map[string]string{
PortMappingLabelKey: string(HostPortMappingFixed), meta.PortMappingKey: HostPortMappingFixed,
} }
selector := labels.Set(filter).AsSelector() selector := labels.Set(filter).AsSelector()
if selector.Matches(labels.Set(pod.Labels)) { if selector.Matches(labels.Set(pod.Labels)) {
return HostPortMappingFixed return HostPortMapperFunc(FixedMapper)
} }
return HostPortMappingWildcard return HostPortMapperFunc(WildcardMapper)
} }
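// mapPodPorts is an illustrative helper only (not part of this change). It shows
// how the mapper selection above is meant to be used: pods labeled with
// meta.PortMappingKey == HostPortMappingFixed get the fixed mapper, all other
// pods fall back to the wildcard mapper.
func mapPodPorts(t *T, offer *mesos.Offer) ([]HostPortMapping, error) {
	mapper := NewHostPortMapper(&t.Pod)
	return mapper.Map(t, offer)
}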

View File

@ -26,15 +26,15 @@ import (
func TestDefaultHostPortMatching(t *testing.T) { func TestDefaultHostPortMatching(t *testing.T) {
t.Parallel() t.Parallel()
task, _ := fakePodTask("foo") task := fakePodTask("foo")
pod := &task.Pod pod := &task.Pod
offer := &mesos.Offer{ offer := &mesos.Offer{
Resources: []*mesos.Resource{ Resources: []*mesos.Resource{
rangeResource("ports", []uint64{1, 1}), newPortsResource("*", 1, 1),
}, },
} }
mapping, err := defaultHostPortMapping(task, offer) mapping, err := FixedMapper(task, offer)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -52,11 +52,11 @@ func TestDefaultHostPortMatching(t *testing.T) {
}}, }},
}}, }},
} }
task, err = New(api.NewDefaultContext(), "", pod) task, err = New(api.NewDefaultContext(), "", pod, &mesos.ExecutorInfo{}, nil)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
_, err = defaultHostPortMapping(task, offer) _, err = FixedMapper(task, offer)
if err, _ := err.(*DuplicateHostPortError); err == nil { if err, _ := err.(*DuplicateHostPortError); err == nil {
t.Fatal("Expected duplicate port error") t.Fatal("Expected duplicate port error")
} else if err.m1.OfferPort != 123 { } else if err.m1.OfferPort != 123 {
@ -66,11 +66,11 @@ func TestDefaultHostPortMatching(t *testing.T) {
func TestWildcardHostPortMatching(t *testing.T) { func TestWildcardHostPortMatching(t *testing.T) {
t.Parallel() t.Parallel()
task, _ := fakePodTask("foo") task := fakePodTask("foo")
pod := &task.Pod pod := &task.Pod
offer := &mesos.Offer{} offer := &mesos.Offer{}
mapping, err := wildcardHostPortMapping(task, offer) mapping, err := WildcardMapper(task, offer)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -81,10 +81,10 @@ func TestWildcardHostPortMatching(t *testing.T) {
//-- //--
offer = &mesos.Offer{ offer = &mesos.Offer{
Resources: []*mesos.Resource{ Resources: []*mesos.Resource{
rangeResource("ports", []uint64{1, 1}), newPortsResource("*", 1, 1),
}, },
} }
mapping, err = wildcardHostPortMapping(task, offer) mapping, err = WildcardMapper(task, offer)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -100,11 +100,11 @@ func TestWildcardHostPortMatching(t *testing.T) {
}}, }},
}}, }},
} }
task, err = New(api.NewDefaultContext(), "", pod) task, err = New(api.NewDefaultContext(), "", pod, &mesos.ExecutorInfo{}, nil)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
mapping, err = wildcardHostPortMapping(task, offer) mapping, err = WildcardMapper(task, offer)
if err == nil { if err == nil {
t.Fatalf("expected error instead of mappings: %#v", mapping) t.Fatalf("expected error instead of mappings: %#v", mapping)
} else if err, _ := err.(*PortAllocationError); err == nil { } else if err, _ := err.(*PortAllocationError); err == nil {
@ -123,11 +123,11 @@ func TestWildcardHostPortMatching(t *testing.T) {
}}, }},
}}, }},
} }
task, err = New(api.NewDefaultContext(), "", pod) task, err = New(api.NewDefaultContext(), "", pod, &mesos.ExecutorInfo{}, nil)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
mapping, err = wildcardHostPortMapping(task, offer) mapping, err = WildcardMapper(task, offer)
if err, _ := err.(*PortAllocationError); err == nil { if err, _ := err.(*PortAllocationError); err == nil {
t.Fatal("Expected port allocation error") t.Fatal("Expected port allocation error")
} else if !(len(err.Ports) == 1 && err.Ports[0] == 123) { } else if !(len(err.Ports) == 1 && err.Ports[0] == 123) {
@ -144,11 +144,11 @@ func TestWildcardHostPortMatching(t *testing.T) {
}}, }},
}}, }},
} }
task, err = New(api.NewDefaultContext(), "", pod) task, err = New(api.NewDefaultContext(), "", pod, &mesos.ExecutorInfo{}, nil)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
mapping, err = wildcardHostPortMapping(task, offer) mapping, err = WildcardMapper(task, offer)
if err, _ := err.(*PortAllocationError); err == nil { if err, _ := err.(*PortAllocationError); err == nil {
t.Fatal("Expected port allocation error") t.Fatal("Expected port allocation error")
} else if len(err.Ports) != 0 { } else if len(err.Ports) != 0 {
@ -158,10 +158,10 @@ func TestWildcardHostPortMatching(t *testing.T) {
//-- //--
offer = &mesos.Offer{ offer = &mesos.Offer{
Resources: []*mesos.Resource{ Resources: []*mesos.Resource{
rangeResource("ports", []uint64{1, 2}), newPortsResource("*", 1, 2),
}, },
} }
mapping, err = wildcardHostPortMapping(task, offer) mapping, err = WildcardMapper(task, offer)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} else if len(mapping) != 2 { } else if len(mapping) != 2 {
@ -190,7 +190,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
}}, }},
}}, }},
} }
task, err = New(api.NewDefaultContext(), "", pod) task, err = New(api.NewDefaultContext(), "", pod, &mesos.ExecutorInfo{}, nil)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -199,7 +199,7 @@ func TestWildcardHostPortMatching(t *testing.T) {
mesosutil.NewRangesResource("ports", []*mesos.Value_Range{mesosutil.NewValueRange(1, 1), mesosutil.NewValueRange(3, 5)}), mesosutil.NewRangesResource("ports", []*mesos.Value_Range{mesosutil.NewValueRange(1, 1), mesosutil.NewValueRange(3, 5)}),
}, },
} }
mapping, err = wildcardHostPortMapping(task, offer) mapping, err = WildcardMapper(task, offer)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} else if len(mapping) != 2 { } else if len(mapping) != 2 {
@ -218,27 +218,3 @@ func TestWildcardHostPortMatching(t *testing.T) {
t.Fatalf("Expected 2 valid port mappings, not %d", valid) t.Fatalf("Expected 2 valid port mappings, not %d", valid)
} }
} }
func TestMappingTypeForPod(t *testing.T) {
pod := &api.Pod{
ObjectMeta: api.ObjectMeta{
Labels: map[string]string{},
},
}
mt := MappingTypeForPod(pod)
if mt != HostPortMappingWildcard {
t.Fatalf("expected wildcard mapping")
}
pod.Labels[PortMappingLabelKey] = string(HostPortMappingFixed)
mt = MappingTypeForPod(pod)
if mt != HostPortMappingFixed {
t.Fatalf("expected fixed mapping")
}
pod.Labels[PortMappingLabelKey] = string(HostPortMappingWildcard)
mt = MappingTypeForPod(pod)
if mt != HostPortMappingWildcard {
t.Fatalf("expected wildcard mapping")
}
}

View File

@ -1,119 +0,0 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/labels"
)
func NewDefaultPredicate(c mresource.CPUShares, m mresource.MegaBytes) FitPredicate {
return RequireAllPredicate([]FitPredicate{
ValidationPredicate,
NodeSelectorPredicate,
NewPodFitsResourcesPredicate(c, m),
PortsPredicate,
}).Fit
}
// FitPredicate implementations determine if the given task "fits" into offered Mesos resources.
// Neither the task or offer should be modified. Note that the node can be nil.
type FitPredicate func(*T, *mesos.Offer, *api.Node) bool
type RequireAllPredicate []FitPredicate
func (f RequireAllPredicate) Fit(t *T, offer *mesos.Offer, n *api.Node) bool {
for _, p := range f {
if !p(t, offer, n) {
return false
}
}
return true
}
func ValidationPredicate(t *T, offer *mesos.Offer, _ *api.Node) bool {
return t != nil && offer != nil
}
func NodeSelectorPredicate(t *T, offer *mesos.Offer, n *api.Node) bool {
// if the user has specified a target host, make sure this offer is for that host
if t.Pod.Spec.NodeName != "" && offer.GetHostname() != t.Pod.Spec.NodeName {
return false
}
// check the NodeSelector
if len(t.Pod.Spec.NodeSelector) > 0 {
if n.Labels == nil {
return false
}
selector := labels.SelectorFromSet(t.Pod.Spec.NodeSelector)
if !selector.Matches(labels.Set(n.Labels)) {
return false
}
}
return true
}
func PortsPredicate(t *T, offer *mesos.Offer, _ *api.Node) bool {
// check ports
if _, err := t.mapper.Generate(t, offer); err != nil {
log.V(3).Info(err)
return false
}
return true
}
func NewPodFitsResourcesPredicate(c mresource.CPUShares, m mresource.MegaBytes) func(t *T, offer *mesos.Offer, _ *api.Node) bool {
return func(t *T, offer *mesos.Offer, _ *api.Node) bool {
// find offered cpu and mem
var (
offeredCpus mresource.CPUShares
offeredMem mresource.MegaBytes
)
for _, resource := range offer.Resources {
if resource.GetName() == "cpus" {
offeredCpus = mresource.CPUShares(*resource.GetScalar().Value)
}
if resource.GetName() == "mem" {
offeredMem = mresource.MegaBytes(*resource.GetScalar().Value)
}
}
// calculate cpu and mem sum over all containers of the pod
// TODO (@sttts): also support pod.spec.resources.limit.request
// TODO (@sttts): take into account the executor resources
_, cpu, _, err := mresource.CPUForPod(&t.Pod, c)
if err != nil {
return false
}
_, mem, _, err := mresource.MemForPod(&t.Pod, m)
if err != nil {
return false
}
log.V(4).Infof("trying to match offer with pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, cpu, mem)
if (cpu > offeredCpus) || (mem > offeredMem) {
log.V(3).Infof("not enough resources for pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, cpu, mem)
return false
}
return true
}
}

View File

@ -17,31 +17,84 @@ limitations under the License.
package podtask package podtask
import ( import (
"fmt"
"math"
"github.com/gogo/protobuf/proto"
log "github.com/golang/glog" log "github.com/golang/glog"
mesos "github.com/mesos/mesos-go/mesosproto" mesos "github.com/mesos/mesos-go/mesosproto"
"github.com/mesos/mesos-go/mesosutil"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/executorinfo"
mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource" mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/labels"
) )
// NewDefaultProcurement returns the default procurement strategy that combines validation // NewDefaultProcurement returns the default procurement strategy that combines validation
// and responsible Mesos resource procurement. c and m are resource quantities written into // and responsible Mesos resource procurement. c and m are resource quantities written into
// k8s api.Pod.Spec's that don't declare resources (all containers in k8s-mesos require cpu // k8s api.Pod.Spec's that don't declare resources (all containers in k8s-mesos require cpu
// and memory limits). // and memory limits).
func NewDefaultProcurement(c mresource.CPUShares, m mresource.MegaBytes) Procurement { func NewDefaultProcurement(prototype *mesos.ExecutorInfo, eir executorinfo.Registry) Procurement {
resourceProcurer := &RequirePodResources{
defaultContainerCPULimit: c,
defaultContainerMemLimit: m,
}
return AllOrNothingProcurement([]Procurement{ return AllOrNothingProcurement([]Procurement{
ValidateProcurement, NewNodeProcurement(),
NodeProcurement, NewPodResourcesProcurement(),
resourceProcurer.Procure, NewPortsProcurement(),
PortsProcurement, NewExecutorResourceProcurer(prototype.GetResources(), eir),
}).Procure })
} }
// Procurement funcs allocate resources for a task from an offer. // Procurement is the interface that implements resource procurement.
// Both the task and/or offer may be modified. //
type Procurement func(*T, *mesos.Offer) error // Procure procurs offered resources for a given pod task T
// on a given node and stores the procurement result.
//
// Initially the procurement pipeline contains an empty Spec
// and the complete Mesos offer. As the procurement pipeline progresses
// the specified resources go up as they are being procured
// while the remaining Mesos offer resources go down until they are depleted.
//
// It returns an error if the procurement failed.
//
// Note that the T struct also includes a Spec field.
// This differs from the procured Spec which is meant to be filled
// by a chain of Procure invocations (procurement pipeline).
//
// In contrast, T.Spec is not meant to be filled by the procurement chain
// but rather by a final scheduler instance.
type Procurement interface {
Procure(*T, *api.Node, *ProcureState) error
}
// ProcureState holds the current state of the procurement pipeline.
// It contains the pod launch specification and the Mesos offer
// from which resources are being procured.
type ProcureState struct {
offer *mesos.Offer // source
spec *Spec // sink
}
// Result returns the procurement result consisting
// of the procured pod specification and the remaining
// Mesos offer.
func (ps *ProcureState) Result() (*Spec, *mesos.Offer) {
return ps.spec, ps.offer
}
// NewProcureState returns a ProcureState containing an empty Spec
// and a deep copy of the given offer.
func NewProcureState(offer *mesos.Offer) *ProcureState {
return &ProcureState{
spec: &Spec{},
offer: proto.Clone(offer).(*mesos.Offer),
}
}
// The ProcurementFunc type is an adapter to use ordinary functions as Procurement implementations.
type ProcurementFunc func(*T, *api.Node, *ProcureState) error
func (p ProcurementFunc) Procure(t *T, n *api.Node, ps *ProcureState) error {
return p(t, n, ps)
}
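// procureForOffer is an illustrative helper only (not part of this change). It
// shows how a procurement pipeline is driven: seed a ProcureState with the offer,
// run the Procurement chain against the task and node, and read back the result.
func procureForOffer(p Procurement, t *T, n *api.Node, offer *mesos.Offer) (*Spec, error) {
	ps := NewProcureState(offer) // starts with an empty Spec and a deep copy of the offer
	if err := p.Procure(t, n, ps); err != nil {
		return nil, err // the offer could not satisfy the pod task
	}
	spec, _ := ps.Result() // the remaining, unconsumed offer resources are ignored here
	return spec, nil
}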
// AllOrNothingProcurement provides a convenient wrapper around multiple Procurement // AllOrNothingProcurement provides a convenient wrapper around multiple Procurement
// objectives: the failure of any Procurement in the set results in Procure failing. // objectives: the failure of any Procurement in the set results in Procure failing.
@ -50,77 +103,204 @@ type AllOrNothingProcurement []Procurement
// Procure runs each Procurement in the receiver list. The first Procurement func that // Procure runs each Procurement in the receiver list. The first Procurement func that
// fails triggers T.Reset() and the error is returned, otherwise returns nil. // fails triggers T.Reset() and the error is returned, otherwise returns nil.
func (a AllOrNothingProcurement) Procure(t *T, offer *mesos.Offer) error { func (a AllOrNothingProcurement) Procure(t *T, n *api.Node, ps *ProcureState) error {
for _, p := range a { for _, p := range a {
if err := p(t, offer); err != nil { err := p.Procure(t, n, ps)
t.Reset() if err != nil {
return err return err
} }
} }
return nil return nil
} }
// ValidateProcurement checks that the offered resources are kosher, and if not panics. // NewNodeProcurement returns a Procurement that checks whether the given pod task and offer
// If things check out ok, t.Spec is cleared and nil is returned. // have valid node information available and whether the pod spec node selector matches
func ValidateProcurement(t *T, offer *mesos.Offer) error { // the node labels.
if offer == nil { // If the check is successful, the slave ID and assigned slave are set in the given Spec.
//programming error func NewNodeProcurement() Procurement {
panic("offer details are nil") return ProcurementFunc(func(t *T, n *api.Node, ps *ProcureState) error {
// if the user has specified a target host, make sure this offer is for that host
if t.Pod.Spec.NodeName != "" && ps.offer.GetHostname() != t.Pod.Spec.NodeName {
return fmt.Errorf(
"NodeName %q does not match offer hostname %q",
t.Pod.Spec.NodeName, ps.offer.GetHostname(),
)
} }
t.Spec = Spec{}
// check the NodeSelector
if len(t.Pod.Spec.NodeSelector) > 0 {
if n.Labels == nil {
return fmt.Errorf(
"NodeSelector %v does not match empty labels of pod %s/%s",
t.Pod.Spec.NodeSelector, t.Pod.Namespace, t.Pod.Name,
)
}
selector := labels.SelectorFromSet(t.Pod.Spec.NodeSelector)
if !selector.Matches(labels.Set(n.Labels)) {
return fmt.Errorf(
"NodeSelector %v does not match labels %v of pod %s/%s",
t.Pod.Spec.NodeSelector, t.Pod.Labels, t.Pod.Namespace, t.Pod.Name,
)
}
}
ps.spec.SlaveID = ps.offer.GetSlaveId().GetValue()
ps.spec.AssignedSlave = ps.offer.GetHostname()
return nil return nil
})
} }
// NodeProcurement updates t.Spec in preparation for the task to be launched on the // NewPodResourcesProcurement converts k8s pod cpu and memory resource requirements into
// slave associated with the offer. // mesos resource allocations.
func NodeProcurement(t *T, offer *mesos.Offer) error { func NewPodResourcesProcurement() Procurement {
t.Spec.SlaveID = offer.GetSlaveId().GetValue() return ProcurementFunc(func(t *T, _ *api.Node, ps *ProcureState) error {
t.Spec.AssignedSlave = offer.GetHostname()
return nil
}
type RequirePodResources struct {
defaultContainerCPULimit mresource.CPUShares
defaultContainerMemLimit mresource.MegaBytes
}
func (r *RequirePodResources) Procure(t *T, offer *mesos.Offer) error {
// write resource limits into the pod spec which is transferred to the executor. From here
// on we can expect that the pod spec of a task has proper limits for CPU and memory.
// TODO(sttts): For a later separation of the kubelet and the executor also patch the pod on the apiserver
// TODO(sttts): fall back to requested resources if resource limit cannot be fulfilled by the offer // TODO(sttts): fall back to requested resources if resource limit cannot be fulfilled by the offer
// TODO(jdef): changing the state of t.Pod here feels dirty, especially since we don't use a kosher _, limits, err := api.PodRequestsAndLimits(&t.Pod)
// method to clone the api.Pod state in T.Clone(). This needs some love.
_, cpuLimit, _, err := mresource.LimitPodCPU(&t.Pod, r.defaultContainerCPULimit)
if err != nil { if err != nil {
return err return err
} }
_, memLimit, _, err := mresource.LimitPodMem(&t.Pod, r.defaultContainerMemLimit) wantedCpus := float64(mresource.NewCPUShares(limits[api.ResourceCPU]))
if err != nil { wantedMem := float64(mresource.NewMegaBytes(limits[api.ResourceMemory]))
return err
log.V(4).Infof(
"trying to match offer with pod %v/%v: cpus: %.2f mem: %.2f MB",
t.Pod.Namespace, t.Pod.Name, wantedCpus, wantedMem,
)
podRoles := t.Roles()
procuredCpu, remaining := procureScalarResources("cpus", wantedCpus, podRoles, ps.offer.GetResources())
if procuredCpu == nil {
return fmt.Errorf(
"not enough cpu resources for pod %s/%s: want=%v",
t.Pod.Namespace, t.Pod.Name, wantedCpus,
)
} }
log.V(3).Infof("Recording offer(s) %s/%s against pod %v: cpu: %.2f, mem: %.2f MB", offer.Id, t.Pod.Namespace, t.Pod.Name, cpuLimit, memLimit) procuredMem, remaining := procureScalarResources("mem", wantedMem, podRoles, remaining)
if procuredMem == nil {
t.Spec.CPU = cpuLimit return fmt.Errorf(
t.Spec.Memory = memLimit "not enough mem resources for pod %s/%s: want=%v",
t.Pod.Namespace, t.Pod.Name, wantedMem,
)
}
ps.offer.Resources = remaining
ps.spec.Resources = append(ps.spec.Resources, append(procuredCpu, procuredMem...)...)
return nil return nil
})
} }
-// PortsProcurement convert host port mappings into mesos port resource allocations.
-func PortsProcurement(t *T, offer *mesos.Offer) error {
+// NewPortsProcurement returns a Procurement procuring ports
+func NewPortsProcurement() Procurement {
+	return ProcurementFunc(func(t *T, _ *api.Node, ps *ProcureState) error {
		// fill in port mapping
-		if mapping, err := t.mapper.Generate(t, offer); err != nil {
+		if mapping, err := t.mapper.Map(t, ps.offer); err != nil {
			return err
		} else {
-			ports := []uint64{}
+			ports := []Port{}
			for _, entry := range mapping {
-				ports = append(ports, entry.OfferPort)
+				ports = append(ports, Port{
+					Port: entry.OfferPort,
+					Role: entry.Role,
+				})
			}
-			t.Spec.PortMap = mapping
-			t.Spec.Ports = ports
+			ps.spec.PortMap = mapping
+			ps.spec.Resources = append(ps.spec.Resources, portRangeResources(ports)...)
		}
		return nil
+	})
}
// NewExecutorResourceProcurer returns a Procurement procuring executor resources
// If a given offer has no executor IDs set, the given prototype executor resources are considered for procurement.
// If a given offer has one executor ID set, only pod resources are being procured.
// An offer with more than one executor ID violates an invariant and results in an error.
func NewExecutorResourceProcurer(resources []*mesos.Resource, registry executorinfo.Registry) Procurement {
return ProcurementFunc(func(t *T, _ *api.Node, ps *ProcureState) error {
eids := len(ps.offer.GetExecutorIds())
switch {
case eids == 0:
wantedCpus := sumResources(filterResources(resources, isScalar, hasName("cpus")))
wantedMem := sumResources(filterResources(resources, isScalar, hasName("mem")))
procuredCpu, remaining := procureScalarResources("cpus", wantedCpus, t.allowedRoles, ps.offer.GetResources())
if procuredCpu == nil {
return fmt.Errorf("not enough cpu resources for executor: want=%v", wantedCpus)
}
procuredMem, remaining := procureScalarResources("mem", wantedMem, t.allowedRoles, remaining)
if procuredMem == nil {
return fmt.Errorf("not enough mem resources for executor: want=%v", wantedMem)
}
ps.offer.Resources = remaining
ps.spec.Executor = registry.New(ps.offer.GetHostname(), append(procuredCpu, procuredMem...))
return nil
case eids == 1:
e, err := registry.Get(ps.offer.GetHostname())
if err != nil {
return err
}
ps.spec.Executor = e
return nil
default:
// offers with more than 1 ExecutorId should be rejected by the
// framework long before they arrive here.
return fmt.Errorf("got offer with more than 1 executor id: %v", ps.offer.GetExecutorIds())
}
})
}
// smallest number such that 1.0 + epsilon != 1.0
// see https://github.com/golang/go/issues/966
var epsilon = math.Nextafter(1, 2) - 1
// procureScalarResources procures offered resources that
// 1. match the given name,
// 2. match the given roles, and
// 3. together fully satisfy the wanted scalar value.
// If the wanted value cannot be fully consumed, nothing is procured.
// Roles are considered in the order of the given roles slice.
func procureScalarResources(
name string,
want float64,
roles []string,
offered []*mesos.Resource,
) (procured, remaining []*mesos.Resource) {
sorted := byRoles(roles...).sort(offered)
procured = make([]*mesos.Resource, 0, len(sorted))
remaining = make([]*mesos.Resource, 0, len(sorted))
for _, r := range sorted {
if want >= epsilon && resourceMatchesAll(r, hasName(name), isScalar) {
left, role := r.GetScalar().GetValue(), r.Role
consumed := math.Min(want, left)
want -= consumed
left -= consumed
if left >= epsilon {
r = mesosutil.NewScalarResource(name, left)
r.Role = role
remaining = append(remaining, r)
}
consumedRes := mesosutil.NewScalarResource(name, consumed)
consumedRes.Role = role
procured = append(procured, consumedRes)
} else {
remaining = append(remaining, r)
}
}
// demanded value (want) was not fully consumed violating invariant 3.
// thus no resources must be procured
if want >= epsilon {
return nil, offered
}
return
}
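As a rough illustration of the role ordering (values made up; the `exampleProcureCPU` helper is not part of this change): procuring 1.5 cpus with roles `["role1", "*"]` drains the `role1` resource first and then tops up from `*`.

```go
func exampleProcureCPU() {
	cpusStar := mesosutil.NewScalarResource("cpus", 2.0) // an unset role is treated as "*"
	cpusRole1 := mesosutil.NewScalarResource("cpus", 1.0)
	cpusRole1.Role = proto.String("role1")

	procured, remaining := procureScalarResources(
		"cpus", 1.5, []string{"role1", "*"},
		[]*mesos.Resource{cpusStar, cpusRole1},
	)
	// expected: procured  = [cpus(role1):1.0, cpus(*):0.5]
	//           remaining = [cpus(*):1.5]
	_, _ = procured, remaining
}
```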

View File

@ -0,0 +1,218 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"testing"
"github.com/mesos/mesos-go/mesosproto"
"github.com/mesos/mesos-go/mesosutil"
mesos "github.com/mesos/mesos-go/mesosproto"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
"reflect"
)
func TestNewPodResourcesProcurement(t *testing.T) {
executor := mesosutil.NewExecutorInfo(
mesosutil.NewExecutorID("executor-id"),
mesosutil.NewCommandInfo("executor-cmd"),
)
executor.Data = []byte{0, 1, 2}
executor.Resources = []*mesosproto.Resource{
scalar("cpus", 0.1, "*"),
scalar("mem", 64.0, "*"),
}
executor.Command = &mesosproto.CommandInfo{
Arguments: []string{},
}
offer := &mesosproto.Offer{
Resources: []*mesosproto.Resource{
scalar("cpus", 4.0, "*"),
scalar("mem", 512.0, "*"),
},
}
task, _ := New(
api.NewDefaultContext(),
"",
&api.Pod{
ObjectMeta: api.ObjectMeta{
Name: "test",
Namespace: api.NamespaceDefault,
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
api.ResourceCPU: *resource.NewQuantity(
3,
resource.DecimalSI,
),
api.ResourceMemory: *resource.NewQuantity(
128*1024*1024,
resource.BinarySI,
),
},
},
},
},
},
},
executor,
[]string{"*"},
)
procurement := NewPodResourcesProcurement()
ps := NewProcureState(offer)
if err := procurement.Procure(task, &api.Node{}, ps); err != nil {
t.Error(err)
}
if len(ps.spec.Resources) == 0 {
t.Errorf("expected procured resources but got none")
}
}
func TestProcureRoleResources(t *testing.T) {
for i, tt := range []struct {
offered []*mesos.Resource
name string // cpu or mem
want float64
roles []string
consumed []*mesos.Resource
left []*mesos.Resource
}{
{
offered: []*mesos.Resource{
scalar("mem", 128.0, "*"),
scalar("mem", 32.0, "slave_public"),
},
name: "mem",
want: 128.0,
roles: []string{"slave_public", "*"},
consumed: []*mesos.Resource{
scalar("mem", 32.0, "slave_public"),
scalar("mem", 96.0, "*"),
},
left: []*mesos.Resource{
scalar("mem", 32.0, "*"),
},
},
{
offered: []*mesos.Resource{
scalar("mem", 128.0, "*"),
scalar("mem", 32.0, "slave_public"),
},
name: "mem",
want: 128.0,
roles: []string{"slave_public"},
consumed: nil,
left: []*mesos.Resource{
scalar("mem", 128.0, "*"),
scalar("mem", 32.0, "slave_public"),
},
},
{
offered: []*mesos.Resource{
scalar("cpus", 1.5, "slave_public"),
scalar("cpus", 1, "slave_public"),
scalar("mem", 128.0, "slave_public"),
scalar("mem", 64.0, "slave_public"),
scalar("mem", 128.0, "*"),
},
name: "mem",
want: 200.0,
roles: []string{"slave_public", "*"},
consumed: []*mesos.Resource{
scalar("mem", 128.0, "slave_public"),
scalar("mem", 64.0, "slave_public"),
scalar("mem", 8.0, "*"),
},
left: []*mesos.Resource{
scalar("cpus", 1.5, "slave_public"),
scalar("cpus", 1, "slave_public"),
scalar("mem", 120, "*"),
},
},
{
offered: []*mesos.Resource{
scalar("mem", 128.0, "*"),
},
name: "mem",
want: 128.0,
roles: []string{"slave_public", "*"},
consumed: []*mesos.Resource{
scalar("mem", 128, "*"),
},
left: []*mesos.Resource{},
},
{
offered: []*mesos.Resource{
scalar("cpu", 32.0, "slave_public"),
},
name: "mem",
want: 128.0,
roles: []string{"slave_public", "*"},
consumed: nil,
left: []*mesos.Resource{
scalar("cpu", 32.0, "slave_public"),
},
},
{
offered: nil,
name: "mem",
want: 160.0,
roles: []string{"slave_public", "*"},
consumed: nil, left: nil,
},
} {
consumed, remaining := procureScalarResources(tt.name, tt.want, tt.roles, tt.offered)
if !reflect.DeepEqual(consumed, tt.consumed) {
t.Errorf("test #%d (consumed):\ngot %v\nwant %v", i, consumed, tt.consumed)
}
if !reflect.DeepEqual(remaining, tt.left) {
t.Errorf("test #%d (remaining):\ngot %v\nwant %v", i, remaining, tt.left)
}
}
}
func scalar(name string, value float64, role string) *mesos.Resource {
res := mesosutil.NewScalarResource(name, value)
res.Role = stringPtrTo(role)
return res
}

View File

@ -1,57 +0,0 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"github.com/gogo/protobuf/proto"
mesos "github.com/mesos/mesos-go/mesosproto"
)
// create a range resource for the listed ports
func rangeResource(name string, ports []uint64) *mesos.Resource {
if len(ports) == 0 {
// pod may consist of a container that doesn't expose any ports on the host
return nil
}
return &mesos.Resource{
Name: proto.String(name),
Type: mesos.Value_RANGES.Enum(),
Ranges: newRanges(ports),
}
}
// generate port ranges from a list of ports. this implementation is very naive
func newRanges(ports []uint64) *mesos.Value_Ranges {
r := make([]*mesos.Value_Range, 0)
for _, port := range ports {
x := proto.Uint64(port)
r = append(r, &mesos.Value_Range{Begin: x, End: x})
}
return &mesos.Value_Ranges{Range: r}
}
func foreachRange(offer *mesos.Offer, resourceName string, f func(begin, end uint64)) {
for _, resource := range offer.Resources {
if resource.GetName() == resourceName {
for _, r := range (*resource).GetRanges().Range {
bp := r.GetBegin()
ep := r.GetEnd()
f(bp, ep)
}
}
}
}

View File

@ -132,7 +132,6 @@ func (k *inMemoryRegistry) Update(task *T) error {
	case StatePending:
		internal.Offer = task.Offer
		internal.Spec = task.Spec
-		(&task.Spec).copyTo(&internal.Spec)
		internal.Flags = map[FlagType]struct{}{}
		fallthrough
	case StateRunning:

View File

@ -17,6 +17,7 @@ limitations under the License.
package podtask

import (
+	"fmt"
	"testing"
	"time"
@ -37,14 +38,14 @@ func TestInMemoryRegistry_RegisterGetUnregister(t *testing.T) {
	assert.Empty(tasks)

	// add a task
-	a, _ := fakePodTask("a")
+	a := fakePodTask("a")
	a_clone, err := registry.Register(a)
	assert.NoError(err)
	assert.Equal(a_clone.ID, a.ID)
	assert.Equal(a_clone.podKey, a.podKey)

	// add another task
-	b, _ := fakePodTask("b")
+	b := fakePodTask("b")
	b_clone, err := registry.Register(b)
	assert.NoError(err)
	assert.Equal(b_clone.ID, b.ID)
@ -53,12 +54,12 @@ func TestInMemoryRegistry_RegisterGetUnregister(t *testing.T) {
	// find tasks in the registry
	tasks = registry.List(func(t *T) bool { return true })
	assert.Len(tasks, 2)
-	assert.Contains(tasks, a_clone)
-	assert.Contains(tasks, b_clone)
+	assertContains(t, a_clone, tasks...)
+	assertContains(t, b_clone, tasks...)

	tasks = registry.List(func(t *T) bool { return t.ID == a.ID })
	assert.Len(tasks, 1)
-	assert.Contains(tasks, a_clone)
+	assertContains(t, a_clone, tasks...)

	task, _ := registry.ForPod(a.podKey)
	assert.NotNil(task)
@ -102,10 +103,10 @@ func TestInMemoryRegistry_RegisterGetUnregister(t *testing.T) {
	tasks = registry.List(func(t *T) bool { return true })
	assert.Len(tasks, 1)
-	assert.Contains(tasks, a)
+	assertContains(t, a, tasks...)

	// unregister a task not registered
-	unregistered_task, _ := fakePodTask("unregistered-task")
+	unregistered_task := fakePodTask("unregistered-task")
	registry.Unregister(unregistered_task)
}
@ -123,7 +124,7 @@ func TestInMemoryRegistry_State(t *testing.T) {
	registry := NewInMemoryRegistry()

	// add a task
-	a, _ := fakePodTask("a")
+	a := fakePodTask("a")
	a_clone, err := registry.Register(a)
	assert.NoError(err)
	assert.Equal(a.State, a_clone.State)
@ -166,7 +167,7 @@ func TestInMemoryRegistry_Update(t *testing.T) {
	// create registry
	registry := NewInMemoryRegistry()
-	a, _ := fakePodTask("a")
+	a := fakePodTask("a")
	registry.Register(a.Clone()) // here clone a because we change it below

	// state changes are ignored
@ -184,7 +185,7 @@ func TestInMemoryRegistry_Update(t *testing.T) {
	assert.Equal(offer.Id(), a_clone.Offer.Id())

	// spec is updated while pending
-	a.Spec = Spec{SlaveID: "slave-1"}
+	a.Spec = &Spec{SlaveID: "slave-1"}
	err = registry.Update(a)
	assert.NoError(err)
	a_clone, _ = registry.Get(a.ID)
@ -212,7 +213,7 @@ func TestInMemoryRegistry_Update(t *testing.T) {
	assert.True(found_bound)

	// spec is ignored while running
-	a.Spec = Spec{SlaveID: "slave-2"}
+	a.Spec = &Spec{SlaveID: "slave-2"}
	err = registry.Update(a)
	assert.NoError(err)
	a_clone, _ = registry.Get(a.ID)
@ -224,7 +225,7 @@ func TestInMemoryRegistry_Update(t *testing.T) {
	assert.Error(err)

	// update unknown task
-	unknown_task, _ := fakePodTask("unknown-task")
+	unknown_task := fakePodTask("unknown-task")
	err = registry.Update(unknown_task)
	assert.Error(err)
@ -255,7 +256,7 @@ func testStateTrace(t *testing.T, transitions []transition) *Registry {
	assert := assert.New(t)
	registry := NewInMemoryRegistry()

-	a, _ := fakePodTask("a")
+	a := fakePodTask("a")
	a, _ = registry.Register(a)

	// initial pending state
@ -319,3 +320,17 @@ func TestInMemoryRegistry_NotFinished(t *testing.T) {
	})
}
}
func assertContains(t *testing.T, want *T, ts ...*T) bool {
for _, got := range ts {
if taskEquals(want, got) {
return true
}
}
return assert.Fail(t, fmt.Sprintf("%v does not contain %v", ts, want))
}
func taskEquals(t1, t2 *T) bool {
return t1.ID == t2.ID && t1.podKey == t2.podKey
}

View File

@ -0,0 +1,156 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"github.com/gogo/protobuf/proto"
mesos "github.com/mesos/mesos-go/mesosproto"
)
// portRangeResources creates one port range resource per role for the given ports.
func portRangeResources(Ports []Port) []*mesos.Resource {
rolePorts := make(map[string][]uint64, len(Ports))
for _, p := range Ports {
rolePorts[p.Role] = append(rolePorts[p.Role], p.Port)
}
resources := make([]*mesos.Resource, 0, len(rolePorts))
for role, ports := range rolePorts {
resources = append(
resources,
&mesos.Resource{
Name: proto.String("ports"),
Type: mesos.Value_RANGES.Enum(),
Ranges: newRanges(ports),
Role: stringPtrTo(role),
},
)
}
return resources
}
// newRanges generates port ranges from the given list of ports. (naive implementation)
func newRanges(ports []uint64) *mesos.Value_Ranges {
r := make([]*mesos.Value_Range, 0, len(ports))
for _, port := range ports {
x := proto.Uint64(port)
r = append(r, &mesos.Value_Range{Begin: x, End: x})
}
return &mesos.Value_Ranges{Range: r}
}
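A short sketch (not part of this change) of how ports procured under different roles end up as separate, role-tagged range resources; the port numbers and roles below are made up:

```go
func examplePortResources() []*mesos.Resource {
	ports := []Port{
		{Port: 31000, Role: "role1"},
		{Port: 31001, Role: "role1"},
		{Port: 8080, Role: "*"},
	}
	// yields two resources:
	//   ports(role1):[31000-31000, 31001-31001]
	//   ports(*):[8080-8080]
	return portRangeResources(ports)
}
```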
// foreachPortsRange calls f for each resource that matches the given roles
// in the order of the given roles.
func foreachPortsRange(rs []*mesos.Resource, roles []string, f func(begin, end uint64, role string)) {
rs = filterResources(rs, hasName("ports"))
rs = byRoles(roles...).sort(rs)
for _, resource := range rs {
for _, r := range (*resource).GetRanges().Range {
bp := r.GetBegin()
ep := r.GetEnd()
f(bp, ep, (*resource).GetRole())
}
}
}
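For illustration only (role names assumed), `foreachPortsRange` can be used to expand every offered host port, preferring role-specific port ranges over `*` ones:

```go
func exampleCollectPorts(offered []*mesos.Resource) []Port {
	var ports []Port
	foreachPortsRange(offered, []string{"role1", "*"}, func(begin, end uint64, role string) {
		for p := begin; p <= end; p++ {
			ports = append(ports, Port{Port: p, Role: role})
		}
	})
	return ports
}
```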
// byRolesSorter sorts resources according to the ordering of roles.
type byRolesSorter struct {
roles []string
}
// byRoles returns a byRolesSorter with the given roles.
func byRoles(roles ...string) *byRolesSorter {
return &byRolesSorter{roles: roles}
}
// sort sorts the given resources according to the order of roles in the byRolesSorter
// and returns the sorted resources.
func (sorter *byRolesSorter) sort(resources []*mesos.Resource) []*mesos.Resource {
rolesMap := map[string][]*mesos.Resource{} // maps roles to resources
for _, res := range resources {
role := starredRole(res.GetRole())
rolesMap[role] = append(rolesMap[role], res)
}
result := make([]*mesos.Resource, 0, len(resources))
for _, role := range sorter.roles {
for _, res := range rolesMap[role] {
result = append(result, res)
}
}
return result
}
// resourcePredicate is a predicate function on *mesos.Resource structs.
type resourcePredicate func(*mesos.Resource) bool
// filter filters the given slice of resources and returns a slice of resources
// matching all given predicates.
func filterResources(res []*mesos.Resource, ps ...resourcePredicate) []*mesos.Resource {
filtered := make([]*mesos.Resource, 0, len(res))
next:
for _, r := range res {
for _, p := range ps {
if !p(r) {
continue next
}
}
filtered = append(filtered, r)
}
return filtered
}
// resourceMatchesAll returns true if the given resource matches all given predicates ps.
func resourceMatchesAll(res *mesos.Resource, ps ...resourcePredicate) bool {
for _, p := range ps {
if !p(res) {
return false
}
}
return true
}
func sumResources(res []*mesos.Resource) float64 {
var sum float64
for _, r := range res {
sum += r.GetScalar().GetValue()
}
return sum
}
// isScalar returns true if the given resource is a scalar type.
func isScalar(r *mesos.Resource) bool {
return r.GetType() == mesos.Value_SCALAR
}
// hasName returns a resourcePredicate which returns true
// if the given resource has the given name.
func hasName(name string) resourcePredicate {
return func(r *mesos.Resource) bool {
return r.GetName() == name
}
}
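As an example (illustrative only), these predicates compose the way `NewExecutorResourceProcurer` uses them: filter the prototype resources down to the scalar `cpus` entries and sum their values, regardless of role:

```go
func exampleWantedCPUs(resources []*mesos.Resource) float64 {
	// keep only scalar "cpus" resources, then add up their values
	return sumResources(filterResources(resources, isScalar, hasName("cpus")))
}
```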

View File

@ -0,0 +1,104 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
// rolePredicate is a predicate function on role strings
type rolePredicate func(string) bool
// filterRoles filters the given slice of roles and returns a slice of roles
// matching all given predicates
func filterRoles(roles []string, ps ...rolePredicate) []string {
filtered := make([]string, 0, len(roles))
next:
for _, r := range roles {
for _, p := range ps {
if !p(r) {
continue next
}
}
filtered = append(filtered, r)
}
return filtered
}
// seenRole returns a rolePredicate which returns true
// if a given role has already been seen in previous invocations.
func seenRole() rolePredicate {
seen := map[string]struct{}{}
return func(role string) bool {
_, ok := seen[role]
if !ok {
seen[role] = struct{}{}
}
return ok
}
}
// emptyRole returns true if the given role is empty
func emptyRole(name string) bool {
return name == ""
}
// not returns a rolePredicate which returns the negation
// of the given predicate
func not(p rolePredicate) rolePredicate {
return func(r string) bool {
return !p(r)
}
}
// inRoles returns a rolePredicate which returns true
// if the given role is present in the given roles
func inRoles(roles ...string) rolePredicate {
roleSet := make(map[string]struct{}, len(roles))
for _, r := range roles {
roleSet[r] = struct{}{}
}
return func(r string) bool {
_, ok := roleSet[r]
return ok
}
}
// starredRole returns a "*" if the given role is empty else the role itself
func starredRole(name string) string {
if name == "" {
return "*"
}
return name
}
// stringPtrTo returns a pointer to the given string
// or nil if it is empty string.
func stringPtrTo(s string) *string {
var protos *string
if s != "" {
protos = &s
}
return protos
}
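As an illustration (not part of this change), the role predicates compose into a de-duplicating whitelist filter; the role values below are made up:

```go
func exampleAllowedRoles() []string {
	requested := []string{"role1", "", "role1", "role2"}
	// drop empty and duplicate entries, then keep only roles the framework may use;
	// yields []string{"role1"}
	return filterRoles(requested, not(emptyRole), not(seenRole()), inRoles("*", "role1"))
}
```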

View File

@ -0,0 +1,66 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtask
import (
"reflect"
"testing"
)
func TestFilterRoles(t *testing.T) {
for i, tt := range []struct {
roles, want []string
predicates []rolePredicate
}{
{
[]string{"role1", "", "role1", "role2", "role3", "role2"},
[]string{"role1", "role2", "role3"},
[]rolePredicate{not(emptyRole), not(seenRole())},
},
{
[]string{},
[]string{},
[]rolePredicate{not(emptyRole)},
},
{
[]string{""},
[]string{},
[]rolePredicate{not(emptyRole)},
},
{
nil,
[]string{},
[]rolePredicate{not(emptyRole)},
},
{
[]string{"role1", "role2"},
[]string{"role1", "role2"},
nil,
},
{
nil,
[]string{},
nil,
},
} {
got := filterRoles(tt.roles, tt.predicates...)
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("test #%d got %#v want %#v", i, got, tt.want)
}
}
}

View File

@ -125,8 +125,8 @@ func LimitPodMem(pod *api.Pod, defaultLimit MegaBytes) (request, limit MegaBytes
	return NewMegaBytes(*r), NewMegaBytes(*l), m, nil
}

-// CPUForPod computes the limits from the spec plus the default CPU limit difference for unlimited containers
-func CPUForPod(pod *api.Pod, defaultLimit CPUShares) (request, limit CPUShares, modified bool, err error) {
+// LimitedCPUForPod computes the limits from the spec plus the default CPU limit difference for unlimited containers
+func LimitedCPUForPod(pod *api.Pod, defaultLimit CPUShares) (request, limit CPUShares, modified bool, err error) {
	r, l, m, err := podResources(pod, api.ResourceCPU, *defaultLimit.Quantity(), *MinimumContainerCPU.Quantity(), false)
	if err != nil {
		return 0.0, 0.0, false, err
@ -134,8 +134,8 @@ func CPUForPod(pod *api.Pod, defaultLimit CPUShares) (request, limit CPUShares,
	return NewCPUShares(*r), NewCPUShares(*l), m, nil
}

-// MemForPod computes the limits from the spec plus the default memory limit difference for unlimited containers
-func MemForPod(pod *api.Pod, defaultLimit MegaBytes) (request, limit MegaBytes, modified bool, err error) {
+// LimitedMemForPod computes the limits from the spec plus the default memory limit difference for unlimited containers
+func LimitedMemForPod(pod *api.Pod, defaultLimit MegaBytes) (request, limit MegaBytes, modified bool, err error) {
	r, l, m, err := podResources(pod, api.ResourceMemory, *defaultLimit.Quantity(), *MinimumContainerMem.Quantity(), true)
	if err != nil {
		return 0.0, 0.0, false, err

View File

@ -83,10 +83,10 @@ func TestResources(tst *testing.T) {
	tst.Logf("Testing resource computation for %v => request=%v limit=%v", t, pod.Spec.Containers[0].Resources.Requests, pod.Spec.Containers[0].Resources.Limits)
	tst.Logf("hasRequests: cpu => %v, mem => %v", resourcequota.PodHasRequests(pod, api.ResourceCPU), resourcequota.PodHasRequests(pod, api.ResourceMemory))

-	beforeCpuR, beforeCpuL, _, err := CPUForPod(pod, DefaultDefaultContainerCPULimit)
+	beforeCpuR, beforeCpuL, _, err := LimitedCPUForPod(pod, DefaultDefaultContainerCPULimit)
	assert.NoError(err, "CPUForPod should not return an error")
-	beforeMemR, beforeMemL, _, err := MemForPod(pod, DefaultDefaultContainerMemLimit)
+	beforeMemR, beforeMemL, _, err := LimitedMemForPod(pod, DefaultDefaultContainerMemLimit)
	assert.NoError(err, "MemForPod should not return an error")

	cpuR, cpuL, _, err := LimitPodCPU(pod, DefaultDefaultContainerCPULimit)

View File

@ -57,12 +57,12 @@ import (
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/algorithm/podschedulers"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/components/framework"
	schedcfg "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/config"
+	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/executorinfo"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
	mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
-	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/resource"
	"k8s.io/kubernetes/pkg/client/cache"
@ -70,6 +70,7 @@ import (
	client "k8s.io/kubernetes/pkg/client/unversioned"
	clientauth "k8s.io/kubernetes/pkg/client/unversioned/auth"
	cloud "k8s.io/kubernetes/pkg/cloudprovider/providers/mesos"
+	controllerfw "k8s.io/kubernetes/pkg/controller/framework"
	"k8s.io/kubernetes/pkg/fields"
	"k8s.io/kubernetes/pkg/healthz"
	"k8s.io/kubernetes/pkg/master/ports"
@ -83,12 +84,14 @@ import (
const (
	defaultMesosMaster       = "localhost:5050"
	defaultMesosUser         = "root" // should have privs to execute docker and iptables commands
+	defaultMesosRoles        = "*"
	defaultReconcileInterval = 300 // 5m default task reconciliation interval
	defaultReconcileCooldown = 15 * time.Second
	defaultNodeRelistPeriod  = 5 * time.Minute
	defaultFrameworkName     = "Kubernetes"
	defaultExecutorCPUs      = mresource.CPUShares(0.25)  // initial CPU allocated for executor
	defaultExecutorMem       = mresource.MegaBytes(128.0) // initial memory allocated for executor
+	defaultExecutorInfoCacheSize = 10000
)

type SchedulerServer struct {
@ -104,7 +107,7 @@ type SchedulerServer struct {
	proxyPath           string
	mesosMaster         string
	mesosUser           string
-	mesosRole           string
+	mesosRoles          []string
	mesosAuthPrincipal  string
	mesosAuthSecretFile string
	mesosCgroupPrefix   string
@ -156,7 +159,6 @@ type SchedulerServer struct {
	staticPodsConfigPath string
	dockerCfgPath        string
	containPodResources  bool
-	accountForPodResources bool
	nodeRelistPeriod     time.Duration
	sandboxOverlay       string
@ -199,6 +201,7 @@ func NewSchedulerServer() *SchedulerServer {
		mesosUser:         defaultMesosUser,
		mesosExecutorCPUs: defaultExecutorCPUs,
		mesosExecutorMem:  defaultExecutorMem,
+		mesosRoles:        strings.Split(defaultMesosRoles, ","),
		reconcileInterval: defaultReconcileInterval,
		reconcileCooldown: defaultReconcileCooldown,
		checkpoint:        true,
@ -208,7 +211,6 @@ func NewSchedulerServer() *SchedulerServer {
		kubeletCadvisorPort:  4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
		kubeletSyncFrequency: 10 * time.Second,
		containPodResources:  true,
-		accountForPodResources: true,
		nodeRelistPeriod:     defaultNodeRelistPeriod,
	}
	// cache this for later use. also useful in case the original binary gets deleted, e.g.
@ -238,7 +240,7 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
	fs.StringVar(&s.mesosMaster, "mesos-master", s.mesosMaster, "Location of the Mesos master. The format is a comma-delimited list of of hosts like zk://host1:port,host2:port/mesos. If using ZooKeeper, pay particular attention to the leading zk:// and trailing /mesos! If not using ZooKeeper, standard URLs like http://localhost are also acceptable.")
	fs.StringVar(&s.mesosUser, "mesos-user", s.mesosUser, "Mesos user for this framework, defaults to root.")
-	fs.StringVar(&s.mesosRole, "mesos-role", s.mesosRole, "Mesos role for this framework, defaults to none.")
+	fs.StringSliceVar(&s.mesosRoles, "mesos-roles", s.mesosRoles, "Mesos framework roles. The first role will be used to launch pods having no "+meta.RolesKey+" label.")
	fs.StringVar(&s.mesosAuthPrincipal, "mesos-authentication-principal", s.mesosAuthPrincipal, "Mesos authentication principal.")
	fs.StringVar(&s.mesosAuthSecretFile, "mesos-authentication-secret-file", s.mesosAuthSecretFile, "Mesos authentication secret file.")
	fs.StringVar(&s.mesosAuthProvider, "mesos-authentication-provider", s.mesosAuthProvider, fmt.Sprintf("Authentication provider to use, default is SASL that supports mechanisms: %+v", mech.ListSupported()))
@ -262,7 +264,6 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
	fs.Var(&s.defaultContainerCPULimit, "default-container-cpu-limit", "Containers without a CPU resource limit are admitted this much CPU shares")
	fs.Var(&s.defaultContainerMemLimit, "default-container-mem-limit", "Containers without a memory resource limit are admitted this much amount of memory in MB")
	fs.BoolVar(&s.containPodResources, "contain-pod-resources", s.containPodResources, "Reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
-	fs.BoolVar(&s.accountForPodResources, "account-for-pod-resources", s.accountForPodResources, "Allocate pod CPU and memory resources from offers (Default: true)")
	fs.DurationVar(&s.nodeRelistPeriod, "node-monitor-period", s.nodeRelistPeriod, "Period between relisting of all nodes from the apiserver.")
	fs.IntVar(&s.executorLogV, "executor-logv", s.executorLogV, "Logging verbosity of spawned minion and executor processes.")
@ -332,7 +333,7 @@ func (s *SchedulerServer) serveFrameworkArtifactWithFilename(path string, filena
	return hostURI
}

-func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.ExecutorInfo, *uid.UID, error) {
+func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.ExecutorInfo, error) {
	ci := &mesos.CommandInfo{
		Shell: proto.Bool(false),
	}
@ -342,7 +343,7 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
		ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
		ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
	} else if !hks.FindServer(hyperkube.CommandMinion) {
-		return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
+		return nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
	} else {
		if strings.Index(s.kmPath, "://") > 0 {
			// URI could point directly to executable, e.g. hdfs:///km
@ -374,7 +375,7 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
	if s.sandboxOverlay != "" {
		if _, err := os.Stat(s.sandboxOverlay); os.IsNotExist(err) {
-			return nil, nil, fmt.Errorf("Sandbox overlay archive not found: %s", s.sandboxOverlay)
+			return nil, fmt.Errorf("Sandbox overlay archive not found: %s", s.sandboxOverlay)
		}
		uri, _ := s.serveFrameworkArtifact(s.sandboxOverlay)
		ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(false), Extract: proto.Bool(true)})
@ -441,19 +442,23 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
	// Check for staticPods
	data, staticPodCPUs, staticPodMem := s.prepareStaticPods()

+	// set prototype resources. During procurement these act as the blueprint only.
+	// In a final ExecutorInfo they might differ due to different procured
+	// resource roles.
	execInfo.Resources = []*mesos.Resource{
		mutil.NewScalarResource("cpus", float64(s.mesosExecutorCPUs)+staticPodCPUs),
		mutil.NewScalarResource("mem", float64(s.mesosExecutorMem)+staticPodMem),
	}

-	// calculate ExecutorInfo hash to be used for validating compatibility
-	// of ExecutorInfo's generated by other HA schedulers.
-	ehash := hashExecutorInfo(execInfo)
-	eid := uid.New(ehash, execcfg.DefaultInfoID)
-	execInfo.ExecutorId = &mesos.ExecutorID{Value: proto.String(eid.String())}
+	// calculate the ExecutorInfo hash to be used for validating compatibility.
+	// It is used to determine whether a running executor is compatible with the
+	// current scheduler configuration. If it is not, offers for those nodes
+	// are declined by our framework and the operator has to phase out those
+	// running executors in a cluster.
+	execInfo.ExecutorId = executorinfo.NewID(execInfo)
	execInfo.Data = data

-	return execInfo, eid, nil
+	return execInfo, nil
}

func (s *SchedulerServer) prepareStaticPods() (data []byte, staticPodCPUs, staticPodMem float64) {
@ -531,6 +536,10 @@ func (s *SchedulerServer) getDriver() (driver bindings.SchedulerDriver) {
}

func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
+	if n := len(s.mesosRoles); n == 0 || n > 2 || (n == 2 && s.mesosRoles[0] != "*" && s.mesosRoles[1] != "*") {
+		log.Fatalf(`only one custom role allowed in addition to "*"`)
+	}
+
	// get scheduler low-level config
	sc := schedcfg.CreateDefaultConfig()
	if s.schedulerConfigFileName != "" {
@ -559,9 +568,8 @@ func (s *SchedulerServer) Run(hks hyperkube.Interface, _ []string) error {
		validation := ha.ValidationFunc(validateLeadershipTransition)
		srv := ha.NewCandidate(schedulerProcess, driverFactory, validation)
		path := fmt.Sprintf(meta.DefaultElectionFormat, s.frameworkName)
-		sid := uid.New(eid.Group(), "").String()
-		log.Infof("registering for election at %v with id %v", path, sid)
-		go election.Notify(election.NewEtcdMasterElector(etcdClient), path, sid, srv, nil)
+		log.Infof("registering for election at %v with id %v", path, eid.GetValue())
+		go election.Notify(election.NewEtcdMasterElector(etcdClient), path, eid.GetValue(), srv, nil)
	} else {
		log.Infoln("self-electing in non-HA mode")
		schedulerProcess.Elect(driverFactory)
@ -616,14 +624,8 @@ func (s *SchedulerServer) awaitFailover(schedulerProcess schedulerProcessInterfa

func validateLeadershipTransition(desired, current string) {
	log.Infof("validating leadership transition")
-	d := uid.Parse(desired).Group()
-	c := uid.Parse(current).Group()
-	if d == 0 {
-		// should *never* happen, but..
-		log.Fatalf("illegal scheduler UID: %q", desired)
-	}
-	if d != c && c != 0 {
-		log.Fatalf("desired scheduler group (%x) != current scheduler group (%x)", d, c)
+	if desired != current && current != "" {
+		log.Fatalf("desired executor id != current executor id", desired, current)
	}
}
@ -637,8 +639,7 @@ func newEtcd(etcdConfigFile string, etcdServerList []string) (client tools.EtcdC
	return
}

-func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config) (*ha.SchedulerProcess, ha.DriverFactory, tools.EtcdClient, *uid.UID) {
+func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config) (*ha.SchedulerProcess, ha.DriverFactory, tools.EtcdClient, *mesos.ExecutorID) {
	s.frameworkName = strings.TrimSpace(s.frameworkName)
	if s.frameworkName == "" {
		log.Fatalf("framework-name must be a non-empty string")
@ -669,7 +670,7 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
		log.Warningf("user-specified reconcile cooldown too small, defaulting to %v", s.reconcileCooldown)
	}

-	executor, eid, err := s.prepareExecutorInfo(hks)
+	eiPrototype, err := s.prepareExecutorInfo(hks)
	if err != nil {
		log.Fatalf("misconfigured executor: %v", err)
	}
@ -683,32 +684,22 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
		log.Fatalf("misconfigured etcd: %v", err)
	}

-	as := podschedulers.NewAllocationStrategy(
-		podtask.NewDefaultPredicate(
-			s.defaultContainerCPULimit,
-			s.defaultContainerMemLimit,
-		),
-		podtask.NewDefaultProcurement(
-			s.defaultContainerCPULimit,
-			s.defaultContainerMemLimit,
-		),
-	)
-
-	// downgrade allocation strategy if user disables "account-for-pod-resources"
-	if !s.accountForPodResources {
-		as = podschedulers.NewAllocationStrategy(
-			podtask.DefaultMinimalPredicate,
-			podtask.DefaultMinimalProcurement)
-	}
-
	// mirror all nodes into the nodeStore
+	var eiRegistry executorinfo.Registry
	nodesClient, err := s.createAPIServerClient()
	if err != nil {
		log.Fatalf("Cannot create client to watch nodes: %v", err)
	}
-	nodeStore := cache.NewStore(cache.MetaNamespaceKeyFunc)
	nodeLW := cache.NewListWatchFromClient(nodesClient, "nodes", api.NamespaceAll, fields.Everything())
-	cache.NewReflector(nodeLW, &api.Node{}, nodeStore, s.nodeRelistPeriod).Run()
+	nodeStore, nodeCtl := controllerfw.NewInformer(nodeLW, &api.Node{}, s.nodeRelistPeriod, &controllerfw.ResourceEventHandlerFuncs{
+		DeleteFunc: func(obj interface{}) {
+			node := obj.(*api.Node)
+			if eiRegistry != nil {
+				log.V(2).Infof("deleting node %q from registry", node.Name)
+				eiRegistry.Invalidate(node.Name)
+			}
+		},
+	})

	lookupNode := func(hostName string) *api.Node {
		n, _, _ := nodeStore.GetByKey(hostName) // ignore error and return nil then
@ -718,10 +709,21 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
		return n.(*api.Node)
	}

-	fcfs := podschedulers.NewFCFSPodScheduler(as, lookupNode)
+	execInfoCache, err := executorinfo.NewCache(defaultExecutorInfoCacheSize)
+	if err != nil {
+		log.Fatalf("cannot create executorinfo cache: %v", err)
+	}
+
+	eiRegistry, err = executorinfo.NewRegistry(lookupNode, eiPrototype, execInfoCache)
+	if err != nil {
+		log.Fatalf("cannot create executorinfo registry: %v", err)
+	}
+
+	pr := podtask.NewDefaultProcurement(eiPrototype, eiRegistry)
+	fcfs := podschedulers.NewFCFSPodScheduler(pr, lookupNode)

	framework := framework.New(framework.Config{
		SchedulerConfig:   *sc,
-		Executor:          executor,
		Client:            client,
		FailoverTimeout:   s.failoverTimeout,
		ReconcileInterval: s.reconcileInterval,
@ -734,6 +736,7 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
				log.Errorf("failed to renew frameworkId TTL: %v", err)
			}
		},
+		ExecutorId: eiPrototype.GetExecutorId(),
	})

	masterUri := s.mesosMaster
@ -765,10 +768,24 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
	// create scheduler core with all components arranged around it
	lw := cache.NewListWatchFromClient(client, "pods", api.NamespaceAll, fields.Everything())
-	sched := components.New(sc, framework, fcfs, client, recorder, schedulerProcess.Terminal(), s.mux, lw)
+	sched := components.New(
+		sc,
+		framework,
+		fcfs,
+		client,
+		recorder,
+		schedulerProcess.Terminal(),
+		s.mux,
+		lw,
+		eiPrototype,
+		s.mesosRoles,
+		s.defaultContainerCPULimit,
+		s.defaultContainerMemLimit,
+	)

	runtime.On(framework.Registration(), func() { sched.Run(schedulerProcess.Terminal()) })
	runtime.On(framework.Registration(), s.newServiceWriter(schedulerProcess.Terminal()))
+	runtime.On(framework.Registration(), func() { nodeCtl.Run(schedulerProcess.Terminal()) })

	driverFactory := ha.DriverFactory(func() (drv bindings.SchedulerDriver, err error) {
		log.V(1).Infoln("performing deferred initialization")
@ -792,7 +809,7 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
		return drv, nil
	})

-	return schedulerProcess, driverFactory, etcdClient, eid
+	return schedulerProcess, driverFactory, etcdClient, eiPrototype.GetExecutorId()
}

func (s *SchedulerServer) failover(driver bindings.SchedulerDriver, hks hyperkube.Interface) error {
@ -871,9 +888,18 @@ func (s *SchedulerServer) buildFrameworkInfo() (info *mesos.FrameworkInfo, cred
	if s.failoverTimeout > 0 {
		info.FailoverTimeout = proto.Float64(s.failoverTimeout)
	}
-	if s.mesosRole != "" {
-		info.Role = proto.String(s.mesosRole)
+
+	// set the framework's role to the first configured non-star role.
+	// once Mesos supports multiple roles simply set the configured mesos roles slice.
+	for _, role := range s.mesosRoles {
+		if role != "*" {
+			// mesos currently supports only one role per framework info.
+			// The framework will be offered the role's resources as well as * resources.
+			info.Role = proto.String(role)
+			break
+		}
	}
+
	if s.mesosAuthPrincipal != "" {
		info.Principal = proto.String(s.mesosAuthPrincipal)
		if s.mesosAuthSecretFile == "" {

View File

@ -1,85 +0,0 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package uid
import (
"fmt"
"strconv"
"strings"
log "github.com/golang/glog"
"github.com/pborman/uuid"
)
type UID struct {
group uint64
name string
ser string
}
func New(group uint64, name string) *UID {
if name == "" {
name = uuid.New()
}
return &UID{
group: group,
name: name,
ser: fmt.Sprintf("%x_%s", group, name),
}
}
func (self *UID) Name() string {
if self != nil {
return self.name
}
return ""
}
func (self *UID) Group() uint64 {
if self != nil {
return self.group
}
return 0
}
func (self *UID) String() string {
if self != nil {
return self.ser
}
return ""
}
func Parse(ser string) *UID {
parts := strings.SplitN(ser, "_", 2)
if len(parts) != 2 {
return nil
}
group, err := strconv.ParseUint(parts[0], 16, 64)
if err != nil {
log.Errorf("illegal UID group %q: %v", parts[0], err)
return nil
}
if parts[1] == "" {
log.Errorf("missing UID name: %q", ser)
return nil
}
return &UID{
group: group,
name: parts[1],
ser: ser,
}
}

View File

@ -1,7 +1,6 @@
accept-hosts
accept-paths
-account-for-pod-resources
admission-control
admission-control-config-file
advertise-address
@ -187,8 +186,8 @@ mesos-executor-cpus
mesos-executor-mem
mesos-launch-grace-period
mesos-master
-mesos-role
mesos-sandbox-overlay
+mesos-roles
mesos-user
minimum-container-ttl-duration
minion-max-log-age

View File

@ -20,6 +20,8 @@ import (
	"fmt"

	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/unversioned"
+	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/fields"
	"k8s.io/kubernetes/pkg/labels"
	"k8s.io/kubernetes/pkg/util"
@ -30,9 +32,13 @@ import (
var _ = Describe("Mesos", func() {
	framework := NewFramework("pods")
+	var c *client.Client
+	var ns string

	BeforeEach(func() {
		SkipUnlessProviderIs("mesos/docker")
+		c = framework.Client
+		ns = framework.Namespace.Name
	})

	It("applies slave attributes as labels", func() {
@ -66,4 +72,46 @@ var _ = Describe("Mesos", func() {
		expectNoError(waitForPodsRunningReady(ns, numpods, util.ForeverTestTimeout),
			fmt.Sprintf("number of static pods in namespace %s is %d", ns, numpods))
	})
It("schedules pods labelled with roles on correct slaves", func() {
// launch a pod to find a node which can launch a pod. We intentionally do
// not just take the node list and choose the first of them. Depending on the
// cluster and the scheduler it might be that a "normal" pod cannot be
// scheduled onto it.
By("Trying to launch a pod with a label to get a node which can launch it.")
podName := "with-label"
_, err := c.Pods(ns).Create(&api.Pod{
TypeMeta: unversioned.TypeMeta{
Kind: "Pod",
},
ObjectMeta: api.ObjectMeta{
Name: podName,
Labels: map[string]string{
"k8s.mesosphere.io/roles": "role1",
},
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Name: podName,
Image: "beta.gcr.io/google_containers/pause:2.0",
},
},
},
})
expectNoError(err)
expectNoError(waitForPodRunningInNamespace(c, podName, ns))
pod, err := c.Pods(ns).Get(podName)
expectNoError(err)
nodeClient := framework.Client.Nodes()
role1 := labels.SelectorFromSet(map[string]string{
"k8s.mesosphere.io/attribute-role": "role1",
})
nodes, err := nodeClient.List(role1, fields.Everything())
expectNoError(err)
Expect(nodes.Items[0].Name).To(Equal(pod.Spec.NodeName))
})
})