scheduler: implement role awareness

Author: Sergiusz Urbaniak
Date: 2015-10-01 16:51:58 +02:00
parent 1a43dcf720
commit 9eae47c6e6
45 changed files with 2591 additions and 914 deletions


@@ -28,7 +28,6 @@ import (
mesos "github.com/mesos/mesos-go/mesosproto"
mutil "github.com/mesos/mesos-go/mesosutil"
bindings "github.com/mesos/mesos-go/scheduler"
execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
"k8s.io/kubernetes/contrib/mesos/pkg/node"
"k8s.io/kubernetes/contrib/mesos/pkg/offers"
@@ -42,7 +41,6 @@ import (
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
client "k8s.io/kubernetes/pkg/client/unversioned"
@@ -71,13 +69,13 @@ type framework struct {
// Config related, write-once
sched scheduler.Scheduler
schedulerConfig *schedcfg.Config
executor *mesos.ExecutorInfo
executorGroup uint64
client *client.Client
failoverTimeout float64 // in seconds
reconcileInterval int64
nodeRegistrator node.Registrator
storeFrameworkId func(id string)
lookupNode node.LookupFunc
executorId *mesos.ExecutorID
// Mesos context
driver bindings.SchedulerDriver // late initialization
@@ -99,7 +97,7 @@ type framework struct {
type Config struct {
SchedulerConfig schedcfg.Config
Executor *mesos.ExecutorInfo
ExecutorId *mesos.ExecutorID
Client *client.Client
StoreFrameworkId func(id string)
FailoverTimeout float64
@@ -114,12 +112,11 @@ func New(config Config) Framework {
k = &framework{
schedulerConfig: &config.SchedulerConfig,
RWMutex: new(sync.RWMutex),
executor: config.Executor,
executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
client: config.Client,
failoverTimeout: config.FailoverTimeout,
reconcileInterval: config.ReconcileInterval,
nodeRegistrator: node.NewRegistrator(config.Client, config.LookupNode),
executorId: config.ExecutorId,
offers: offers.CreateRegistry(offers.RegistryConfig{
Compat: func(o *mesos.Offer) bool {
// the node must be registered and have up-to-date labels
@@ -128,10 +125,17 @@ func New(config Config) Framework {
return false
}
// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours
for _, eid := range o.GetExecutorIds() {
execuid := uid.Parse(eid.GetValue())
if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
eids := len(o.GetExecutorIds())
switch {
case eids > 1:
// at most one executor id expected. More than one means that
// the given node is seriously in trouble.
return false
case eids == 1:
// the executor id must match, otherwise the running executor
// is incompatible with the current scheduler configuration.
if eid := o.GetExecutorIds()[0]; eid.GetValue() != config.ExecutorId.GetValue() {
return false
}
}
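
The switch above is the new offer-compatibility rule: an offer advertising more than one executor id is rejected outright, and a single executor id must equal the id this scheduler was configured with. A minimal standalone sketch of that rule, using a hypothetical offerCompatible helper (the real check lives inline in the offers registry Compat callback above):

package sketch

import mesos "github.com/mesos/mesos-go/mesosproto"

// offerCompatible mirrors the executor-id check above: it reports whether an
// offer can be used by a scheduler configured with executor id `want`.
func offerCompatible(o *mesos.Offer, want *mesos.ExecutorID) bool {
    switch eids := o.GetExecutorIds(); len(eids) {
    case 0:
        // no executor registered on this slave yet, nothing to conflict with
        return true
    case 1:
        // exactly one executor: it must match the current scheduler
        // configuration, otherwise the running executor is incompatible
        return eids[0].GetValue() == want.GetValue()
    default:
        // more than one executor id means the node is seriously in trouble
        return false
    }
}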
@@ -161,6 +165,7 @@ func New(config Config) Framework {
return proc.ErrorChanf("cannot execute action with unregistered scheduler")
}),
storeFrameworkId: config.StoreFrameworkId,
lookupNode: config.LookupNode,
}
return k
}
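
For context, a minimal sketch of how a caller might populate the reworked Config, assuming placeholder values (schedConfig, executorInfo, kubeClient, failoverTimeout, lookupNode) that the scheduler service is expected to hold already; only fields visible in this diff are shown:

cfg := Config{
    SchedulerConfig:  *schedConfig,  // schedcfg.Config owned by the caller
    Executor:         executorInfo,  // *mesos.ExecutorInfo built for this run
    ExecutorId:       mutil.NewExecutorID(executorInfo.ExecutorId.GetValue()),
    Client:           kubeClient,
    StoreFrameworkId: func(id string) { /* persist the framework id */ },
    FailoverTimeout:  failoverTimeout,
    LookupNode:       lookupNode,    // node.LookupFunc backed by the node store
}
fw := New(cfg)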
@@ -188,6 +193,45 @@ func (k *framework) asMaster() proc.Doer {
return k.asRegisteredMaster
}
// An executorRef holds a reference to an executor and the slave it is running on
type executorRef struct {
executorID *mesos.ExecutorID
slaveID *mesos.SlaveID
}
// executorRefs returns a slice of references to running executors known to this framework
func (k *framework) executorRefs() []executorRef {
slaves := k.slaveHostNames.SlaveIDs()
refs := make([]executorRef, 0, len(slaves))
for _, slaveID := range slaves {
hostname := k.slaveHostNames.HostName(slaveID)
if hostname == "" {
log.Warningf("hostname lookup for slaveID %q failed", slaveID)
continue
}
node := k.lookupNode(hostname)
if node == nil {
log.Warningf("node lookup for slaveID %q failed", slaveID)
continue
}
eid, ok := node.Annotations[meta.ExecutorIdKey]
if !ok {
log.Warningf("unable to find %q annotation for node %v", meta.ExecutorIdKey, node)
continue
}
refs = append(refs, executorRef{
executorID: mutil.NewExecutorID(eid),
slaveID: mutil.NewSlaveID(slaveID),
})
}
return refs
}
func (k *framework) installDebugHandlers(mux *http.ServeMux) {
wrappedHandler := func(uri string, h http.Handler) {
mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
@@ -210,6 +254,7 @@ func (k *framework) installDebugHandlers(mux *http.ServeMux) {
}
})
}
requestReconciliation := func(uri string, requestAction func()) {
wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestAction()
@@ -220,18 +265,34 @@ func (k *framework) installDebugHandlers(mux *http.ServeMux) {
requestReconciliation("/debug/actions/requestImplicit", k.tasksReconciler.RequestImplicit)
wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
slaves := k.slaveHostNames.SlaveIDs()
for _, slaveId := range slaves {
refs := k.executorRefs()
for _, ref := range refs {
_, err := k.driver.SendFrameworkMessage(
k.executor.ExecutorId,
mutil.NewSlaveID(slaveId),
messages.Kamikaze)
ref.executorID,
ref.slaveID,
messages.Kamikaze,
)
if err != nil {
log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
} else {
io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
msg := fmt.Sprintf(
"error sending kamikaze message to executor %q on slave %q: %v",
ref.executorID.GetValue(),
ref.slaveID.GetValue(),
err,
)
log.Warning(msg)
fmt.Fprintln(w, msg)
continue
}
io.WriteString(w, fmt.Sprintf(
"kamikaze message sent to executor %q on slave %q\n",
ref.executorID.GetValue(),
ref.slaveID.GetValue(),
))
}
io.WriteString(w, "OK")
}))
}
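
The executorRefs helper above resolves executor ids from node annotations instead of a cached ExecutorInfo, assuming the registration path stores the id under meta.ExecutorIdKey. A hypothetical illustration of the node object it expects (hostname and annotation value are placeholders):

node := &api.Node{
    ObjectMeta: api.ObjectMeta{
        Name: "slave-hostname.example.com",
        Annotations: map[string]string{
            // written when the node/executor is registered; read back by executorRefs()
            meta.ExecutorIdKey: "<executor-id>",
        },
    },
}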
@@ -702,11 +763,16 @@ func (ks *framework) KillTask(id string) error {
}
func (ks *framework) LaunchTask(t *podtask.T) error {
taskInfo, err := t.BuildTaskInfo()
if err != nil {
return err
}
// assume caller is holding scheduler lock
taskList := []*mesos.TaskInfo{t.BuildTaskInfo(ks.executor)}
taskList := []*mesos.TaskInfo{taskInfo}
offerIds := []*mesos.OfferID{t.Offer.Details().Id}
filters := &mesos.Filters{}
_, err := ks.driver.LaunchTasks(offerIds, taskList, filters)
_, err = ks.driver.LaunchTasks(offerIds, taskList, filters)
return err
}
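
Since BuildTaskInfo can now fail, launch errors surface before anything reaches the Mesos driver. A minimal sketch of a caller, assuming a hypothetical framework fw and pod task t:

if err := fw.LaunchTask(t); err != nil {
    // the task never reached Mesos; a caller could log the failure and
    // requeue the pod for another scheduling attempt
    log.Errorf("failed to launch task: %v", err)
}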