scheduler: implement role awareness
@@ -28,7 +28,6 @@ import (
 	mesos "github.com/mesos/mesos-go/mesosproto"
 	mutil "github.com/mesos/mesos-go/mesosutil"
 	bindings "github.com/mesos/mesos-go/scheduler"
-	execcfg "k8s.io/kubernetes/contrib/mesos/pkg/executor/config"
 	"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
 	"k8s.io/kubernetes/contrib/mesos/pkg/node"
 	"k8s.io/kubernetes/contrib/mesos/pkg/offers"
@@ -42,7 +41,6 @@ import (
 	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
 	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
 	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
-	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
 	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/api/errors"
 	client "k8s.io/kubernetes/pkg/client/unversioned"
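
Both import removals fall straight out of the compatibility-check rewrite further down: once the group-based matching of kubelet executors is gone, nothing in this file references execcfg.DefaultInfoID or uid.Parse any longer.
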
@@ -71,13 +69,13 @@ type framework struct {
 	// Config related, write-once
 	sched             scheduler.Scheduler
 	schedulerConfig   *schedcfg.Config
-	executor          *mesos.ExecutorInfo
-	executorGroup     uint64
 	client            *client.Client
 	failoverTimeout   float64 // in seconds
 	reconcileInterval int64
 	nodeRegistrator   node.Registrator
 	storeFrameworkId  func(id string)
+	lookupNode        node.LookupFunc
+	executorId        *mesos.ExecutorID

 	// Mesos context
 	driver bindings.SchedulerDriver // late initialization
@@ -99,7 +97,7 @@ type framework struct {

 type Config struct {
 	SchedulerConfig  schedcfg.Config
-	Executor         *mesos.ExecutorInfo
+	ExecutorId       *mesos.ExecutorID
 	Client           *client.Client
 	StoreFrameworkId func(id string)
 	FailoverTimeout  float64
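
With Executor *mesos.ExecutorInfo replaced by ExecutorId *mesos.ExecutorID, the scheduler only carries the executor's identity; the full, role-aware ExecutorInfo is now resolved per task (see the LaunchTask hunk at the end). A minimal sketch of wiring the new Config shape — newFramework, schedCfg, kubeClient, and lookupNode are hypothetical placeholders, and Config has further fields this hunk truncates (ReconcileInterval among them):

	// hypothetical wiring; not part of the commit
	func newFramework(schedCfg schedcfg.Config, kubeClient *client.Client, lookupNode node.LookupFunc) Framework {
		return New(Config{
			SchedulerConfig:  schedCfg,
			ExecutorId:       mutil.NewExecutorID("some-executor-id"), // identity only, no *mesos.ExecutorInfo
			Client:           kubeClient,
			StoreFrameworkId: func(id string) { /* persist the registered framework id */ },
			FailoverTimeout:  (7 * 24 * time.Hour).Seconds(), // float64 seconds, needs "time"
			LookupNode:       lookupNode,                     // also feeds the new executorRefs helper
		})
	}
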
@@ -114,12 +112,11 @@ func New(config Config) Framework {
 	k = &framework{
 		schedulerConfig:   &config.SchedulerConfig,
 		RWMutex:           new(sync.RWMutex),
-		executor:          config.Executor,
-		executorGroup:     uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
 		client:            config.Client,
 		failoverTimeout:   config.FailoverTimeout,
 		reconcileInterval: config.ReconcileInterval,
 		nodeRegistrator:   node.NewRegistrator(config.Client, config.LookupNode),
+		executorId:        config.ExecutorId,
 		offers: offers.CreateRegistry(offers.RegistryConfig{
 			Compat: func(o *mesos.Offer) bool {
 				// the node must be registered and have up-to-date labels
@@ -128,10 +125,17 @@ func New(config Config) Framework {
 					return false
 				}

-				// the executor IDs must not identify a kubelet-executor with a group that doesn't match ours
-				for _, eid := range o.GetExecutorIds() {
-					execuid := uid.Parse(eid.GetValue())
-					if execuid.Name() == execcfg.DefaultInfoID && execuid.Group() != k.executorGroup {
+				eids := len(o.GetExecutorIds())
+				switch {
+				case eids > 1:
+					// at most one executor id expected. More than one means that
+					// the given node is seriously in trouble.
+					return false
+
+				case eids == 1:
+					// the executor id must match, otherwise the running executor
+					// is incompatible with the current scheduler configuration.
+					if eid := o.GetExecutorIds()[0]; eid.GetValue() != config.ExecutorId.GetValue() {
 						return false
 					}
 				}
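
The rewritten predicate drops the uid group comparison entirely: instead of asking "does the running kubelet-executor belong to my group?", it requires the advertised executor id to equal the configured one exactly, and treats an offer listing several executor ids as coming from a broken node. Restated as a standalone helper (hypothetical name and packaging, same logic):

	import mesos "github.com/mesos/mesos-go/mesosproto"

	func compatibleExecutors(o *mesos.Offer, want *mesos.ExecutorID) bool {
		eids := o.GetExecutorIds()
		switch {
		case len(eids) > 1:
			// several executors on one node: the node is in trouble, reject
			return false
		case len(eids) == 1:
			// a lone executor must be ours, otherwise it was launched by an
			// incompatible scheduler configuration
			return eids[0].GetValue() == want.GetValue()
		}
		// no executor running on the node yet: compatible
		return true
	}
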
@@ -161,6 +165,7 @@ func New(config Config) Framework {
 			return proc.ErrorChanf("cannot execute action with unregistered scheduler")
 		}),
 		storeFrameworkId: config.StoreFrameworkId,
+		lookupNode:       config.LookupNode,
 	}
 	return k
 }
@@ -188,6 +193,45 @@ func (k *framework) asMaster() proc.Doer {
 	return k.asRegisteredMaster
 }

+// An executorRef holds a reference to an executor and the slave it is running on
+type executorRef struct {
+	executorID *mesos.ExecutorID
+	slaveID    *mesos.SlaveID
+}
+
+// executorRefs returns a slice of references to the running executors known to this framework
+func (k *framework) executorRefs() []executorRef {
+	slaves := k.slaveHostNames.SlaveIDs()
+	refs := make([]executorRef, 0, len(slaves))
+
+	for _, slaveID := range slaves {
+		hostname := k.slaveHostNames.HostName(slaveID)
+		if hostname == "" {
+			log.Warningf("hostname lookup for slaveID %q failed", slaveID)
+			continue
+		}
+
+		node := k.lookupNode(hostname)
+		if node == nil {
+			log.Warningf("node lookup for slaveID %q failed", slaveID)
+			continue
+		}
+
+		eid, ok := node.Annotations[meta.ExecutorIdKey]
+		if !ok {
+			log.Warningf("unable to find %q annotation for node %v", meta.ExecutorIdKey, node)
+			continue
+		}
+
+		refs = append(refs, executorRef{
+			executorID: mutil.NewExecutorID(eid),
+			slaveID:    mutil.NewSlaveID(slaveID),
+		})
+	}
+
+	return refs
+}
+
 func (k *framework) installDebugHandlers(mux *http.ServeMux) {
 	wrappedHandler := func(uri string, h http.Handler) {
 		mux.HandleFunc(uri, func(w http.ResponseWriter, r *http.Request) {
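
The new helper leans on the lookupNode field stored in New above: slave ids come from the scheduler's slaveHostNames registry, and the executor id is read back from the node object's meta.ExecutorIdKey annotation. A stub lookup of the kind a test might inject (hypothetical; api.Node embeds ObjectMeta, so Name and Annotations are addressable directly):

	// every hostname resolves to a node carrying the executor-id annotation,
	// so executorRefs yields one ref per known slave
	lookup := func(hostname string) *api.Node {
		n := &api.Node{}
		n.Name = hostname
		n.Annotations = map[string]string{
			meta.ExecutorIdKey: "some-executor-id",
		}
		return n
	}
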
@@ -210,6 +254,7 @@ func (k *framework) installDebugHandlers(mux *http.ServeMux) {
 			}
 		})
 	}
+
 	requestReconciliation := func(uri string, requestAction func()) {
 		wrappedHandler(uri, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			requestAction()
@@ -220,18 +265,34 @@ func (k *framework) installDebugHandlers(mux *http.ServeMux) {
 	requestReconciliation("/debug/actions/requestImplicit", k.tasksReconciler.RequestImplicit)

 	wrappedHandler("/debug/actions/kamikaze", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		slaves := k.slaveHostNames.SlaveIDs()
-		for _, slaveId := range slaves {
+		refs := k.executorRefs()
+
+		for _, ref := range refs {
 			_, err := k.driver.SendFrameworkMessage(
-				k.executor.ExecutorId,
-				mutil.NewSlaveID(slaveId),
-				messages.Kamikaze)
+				ref.executorID,
+				ref.slaveID,
+				messages.Kamikaze,
+			)
+
 			if err != nil {
-				log.Warningf("failed to send kamikaze message to slave %s: %v", slaveId, err)
-			} else {
-				io.WriteString(w, fmt.Sprintf("kamikaze slave %s\n", slaveId))
+				msg := fmt.Sprintf(
+					"error sending kamikaze message to executor %q on slave %q: %v",
+					ref.executorID.GetValue(),
+					ref.slaveID.GetValue(),
+					err,
+				)
+				log.Warning(msg)
+				fmt.Fprintln(w, msg)
+				continue
 			}
+
+			io.WriteString(w, fmt.Sprintf(
+				"kamikaze message sent to executor %q on slave %q\n",
+				ref.executorID.GetValue(),
+				ref.slaveID.GetValue(),
+			))
 		}

 		io.WriteString(w, "OK")
 	}))
 }
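
The handler now reports one line per executor/slave pair instead of one line per slave, and failures are echoed to the HTTP response as well as the log before the loop continues. A round-trip against the endpoint might look like this (sketch; the address, port, and the mux being reachable there are assumptions):

	resp, err := http.Get("http://127.0.0.1:10251/debug/actions/kamikaze")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	io.Copy(os.Stdout, resp.Body) // one status line per executor/slave pair, then "OK"
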
@@ -702,11 +763,16 @@ func (ks *framework) KillTask(id string) error {
 }

 func (ks *framework) LaunchTask(t *podtask.T) error {
+	taskInfo, err := t.BuildTaskInfo()
+	if err != nil {
+		return err
+	}
+
 	// assume caller is holding scheduler lock
-	taskList := []*mesos.TaskInfo{t.BuildTaskInfo(ks.executor)}
+	taskList := []*mesos.TaskInfo{taskInfo}
 	offerIds := []*mesos.OfferID{t.Offer.Details().Id}
 	filters := &mesos.Filters{}
-	_, err := ks.driver.LaunchTasks(offerIds, taskList, filters)
+	_, err = ks.driver.LaunchTasks(offerIds, taskList, filters)
 	return err
 }
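
Two details in this last hunk: BuildTaskInfo no longer receives the framework-wide ExecutorInfo and may now fail, since the executor and its role-aware resources are resolved per task; and the LaunchTasks line switches from := to = because err is already declared by the BuildTaskInfo call, so a second short variable declaration in the same scope would not compile. The resulting call shape, restated as a sketch:

	taskInfo, err := t.BuildTaskInfo() // ":=" declares taskInfo and err
	if err != nil {
		return err
	}
	_, err = ks.driver.LaunchTasks(offerIds, taskList, filters) // "=" reuses err
	return err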