Use BindingHostKey annotation to detect scheduled pods in k8sm-scheduler

Previously, the NodeName field in the pod spec was used to detect already-scheduled pods.
As a consequence, pods with a fixed, pre-set NodeName were never scheduled by the
k8sm-scheduler, which led, for example, to a failing e2e intra-pod test.

Fixes mesosphere/kubernetes-mesos#388
Author: Dr. Stefan Schimanski
Date:   2015-07-13 16:34:43 +02:00
parent b0d31fb794
commit f59b5f503b
7 changed files with 199 additions and 84 deletions
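
In short, the scheduler now treats a pod as already scheduled once the BindingHostKey annotation is present, instead of keying off a non-empty pod.Spec.NodeName. A minimal, self-contained sketch of that idea (the annotation key and the helper name mirror the diff below; the Pod type here is a stripped-down stand-in for api.Pod, not the real type):

package main

import "fmt"

// Pod is a stripped-down stand-in for the Kubernetes api.Pod type.
type Pod struct {
	NodeName    string            // may be pre-set by the user for pinned pods
	Annotations map[string]string // written by the scheduler at launch time
}

const bindingHostKey = "k8s.mesosphere.io/bindingHost"

// annotatedForExecutor mirrors the new check: a pod counts as handled by the
// k8sm-scheduler only once the binding-host annotation has been written,
// regardless of whether NodeName was pre-set by the user.
func annotatedForExecutor(p *Pod) bool {
	_, ok := p.Annotations[bindingHostKey]
	return ok
}

func main() {
	pinned := &Pod{NodeName: "hostname1"} // fixed NodeName, not launched yet

	// Old behaviour: a non-empty NodeName made the pod look already scheduled,
	// so pinned pods were silently skipped. New behaviour: still schedulable.
	fmt.Println(annotatedForExecutor(pinned)) // false -> still gets scheduled

	pinned.Annotations = map[string]string{bindingHostKey: "hostname1"}
	fmt.Println(annotatedForExecutor(pinned)) // true -> treated as scheduled
}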

View File

@@ -18,7 +18,10 @@ package meta
// kubernetes api object annotations
const (
// The BindingHostKey pod annotation marks a pod as being assigned to a Mesos
// slave: the pod is either already launched or will be launched on that slave as a task.
BindingHostKey = "k8s.mesosphere.io/bindingHost"
TaskIdKey = "k8s.mesosphere.io/taskId"
SlaveIdKey = "k8s.mesosphere.io/slaveId"
OfferIdKey = "k8s.mesosphere.io/offerId"

View File

@@ -151,6 +151,11 @@ func (b *binder) rollback(task *podtask.T, err error) error {
}
// assumes that: caller has acquired scheduler lock and that the task is still pending
//
// bind does not actually do the binding itself, but launches the pod as a Mesos task. The
// kubernetes executor on the slave will finally do the binding. This is different from the
// upstream scheduler in the sense that the upstream scheduler does the binding and the
// kubelet will notice that and launch the pod.
func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) {
// sanity check: ensure that the task hasAcceptedOffer(), it's possible that between
// Schedule() and now that the offer for this task was rescinded or invalidated.
@@ -193,16 +198,7 @@ func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *pod
oemCt := pod.Spec.Containers
pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
} else {
oemAnn := pod.Annotations
pod.Annotations = make(map[string]string)
for k, v := range oemAnn {
pod.Annotations[k] = v
}
}
pod.Annotations[annotation.BindingHostKey] = machine
annotateForExecutorOnSlave(&pod, machine)
task.SaveRecoveryInfo(pod.Annotations)
for _, entry := range task.Spec.PortMap {
@@ -237,6 +233,31 @@ type kubeScheduler struct {
defaultContainerMemLimit mresource.MegaBytes
}
// annotatedForExecutor checks whether a pod is assigned to a Mesos slave, and
// possibly already launched. It may or may not already be scheduled in the
// kubernetes sense, i.e. the NodeName field might still be empty.
func annotatedForExecutor(pod *api.Pod) bool {
_, ok := pod.ObjectMeta.Annotations[annotation.BindingHostKey]
return ok
}
// annotateForExecutorOnSlave sets the BindingHostKey annotation which
// marks the pod to be processed by the scheduler and launched as a Mesos
// task. The executor on the slave will do the final binding to finish the
// scheduling in the kubernetes sense.
func annotateForExecutorOnSlave(pod *api.Pod, slave string) {
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
} else {
oemAnn := pod.Annotations
pod.Annotations = make(map[string]string)
for k, v := range oemAnn {
pod.Annotations[k] = v
}
}
pod.Annotations[annotation.BindingHostKey] = slave
}
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selectedMachine's name and error (if there's any).
func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.MinionLister) (string, error) {
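
A side note on the map handling in annotateForExecutorOnSlave above: the helper replaces the Annotations map with a copy before writing the binding host, so a pod object that may be shared with other readers is never mutated in place. A standalone sketch of that copy-on-write pattern (setAnnotation is an illustrative name, not a function from this tree):

package main

import "fmt"

// setAnnotation writes key=value on a copy of the annotation map and returns
// the copy, leaving the original map untouched for shared readers.
func setAnnotation(ann map[string]string, key, value string) map[string]string {
	out := make(map[string]string, len(ann)+1)
	for k, v := range ann {
		out[k] = v
	}
	out[key] = value
	return out
}

func main() {
	original := map[string]string{"k8s.mesosphere.io/taskId": "task-1"}
	updated := setAnnotation(original, "k8s.mesosphere.io/bindingHost", "slave-a")

	fmt.Println(len(original)) // 1 -- the shared map is unchanged
	fmt.Println(len(updated))  // 2 -- only the copy carries the binding host
}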
@@ -441,7 +462,7 @@ func (q *queuer) Run(done <-chan struct{}) {
}
pod := p.(*Pod)
if pod.Spec.NodeName != "" {
if annotatedForExecutor(pod.Pod) {
log.V(3).Infof("dequeuing pod for scheduling: %v", pod.Pod.Name)
q.dequeue(pod.GetUID())
} else {
@@ -490,7 +511,7 @@ func (q *queuer) yield() *api.Pod {
log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
} else if pod.Spec.NodeName != "" {
} else if annotatedForExecutor(pod) {
// should never happen if enqueuePods is filtering properly
log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
} else {
@@ -798,7 +819,7 @@ func (s *schedulingPlugin) reconcilePod(oldPod api.Pod) {
return
}
if oldPod.Spec.NodeName != pod.Spec.NodeName {
if pod.Spec.NodeName == "" {
if annotatedForExecutor(pod) {
// pod is unscheduled.
// it's possible that we dropped the pod in the scheduler error handler
// because of task misalignment with the pod (task.Has(podtask.Launched) == true)

View File

@@ -37,6 +37,7 @@ import (
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/queue"
schedcfg "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/config"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/ha"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/meta"
"github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/podtask"
mresource "github.com/GoogleCloudPlatform/kubernetes/contrib/mesos/pkg/scheduler/resource"
log "github.com/golang/glog"
@@ -189,8 +190,11 @@ func (lw *MockPodsListWatch) Delete(pod *api.Pod, notify bool) {
}
// Create a pod with a given index, requiring one port
func NewTestPod(i int) *api.Pod {
name := fmt.Sprintf("pod%d", i)
var currentPodNum int = 0
func NewTestPod() (*api.Pod, int) {
currentPodNum = currentPodNum + 1
name := fmt.Sprintf("pod%d", currentPodNum)
return &api.Pod{
TypeMeta: api.TypeMeta{APIVersion: testapi.Version()},
ObjectMeta: api.ObjectMeta{
@@ -203,7 +207,7 @@ func NewTestPod(i int) *api.Pod {
{
Ports: []api.ContainerPort{
{
ContainerPort: 8000 + i,
ContainerPort: 8000 + currentPodNum,
Protocol: api.ProtocolTCP,
},
},
@@ -211,7 +215,7 @@ func NewTestPod(i int) *api.Pod {
},
},
Status: api.PodStatus{
PodIP: fmt.Sprintf("1.2.3.%d", 4+i),
PodIP: fmt.Sprintf("1.2.3.%d", 4+currentPodNum),
Conditions: []api.PodCondition{
{
Type: api.PodReady,
@@ -219,12 +223,12 @@ func NewTestPod(i int) *api.Pod {
},
},
},
}
}, currentPodNum
}
// Offering some cpus and memory and the 8000-9000 port range
func NewTestOffer(i int) *mesos.Offer {
hostname := fmt.Sprintf("h%d", i)
func NewTestOffer(id string) *mesos.Offer {
hostname := "some_hostname"
cpus := util.NewScalarResource("cpus", 3.75)
mem := util.NewScalarResource("mem", 940)
var port8000 uint64 = 8000
@@ -232,7 +236,7 @@ func NewTestOffer(i int) *mesos.Offer {
ports8000to9000 := mesos.Value_Range{Begin: &port8000, End: &port9000}
ports := util.NewRangesResource("ports", []*mesos.Value_Range{&ports8000to9000})
return &mesos.Offer{
Id: util.NewOfferID(fmt.Sprintf("offer%d", i)),
Id: util.NewOfferID(id),
Hostname: &hostname,
SlaveId: util.NewSlaveID(hostname),
Resources: []*mesos.Resource{cpus, mem, ports},
@@ -435,11 +439,20 @@ func TestPlugin_LifeCycle(t *testing.T) {
mockDriver.On("SendFrameworkMessage", mAny("*mesosproto.ExecutorID"), mAny("*mesosproto.SlaveID"), mAny("string")).
Return(mesos.Status_DRIVER_RUNNING, nil)
launchedTasks := make(chan *mesos.TaskInfo, 1)
type LaunchedTask struct {
offerId mesos.OfferID
taskInfo *mesos.TaskInfo
}
launchedTasks := make(chan LaunchedTask, 1)
launchTasksCalledFunc := func(args mock.Arguments) {
offerIDs := args.Get(0).([]*mesos.OfferID)
taskInfos := args.Get(1).([]*mesos.TaskInfo)
assert.Equal(1, len(offerIDs))
assert.Equal(1, len(taskInfos))
launchedTasks <- taskInfos[0]
launchedTasks <- LaunchedTask{
offerId: *offerIDs[0],
taskInfo: taskInfos[0],
}
}
mockDriver.On("LaunchTasks", mAny("[]*mesosproto.OfferID"), mAny("[]*mesosproto.TaskInfo"), mAny("*mesosproto.Filters")).
Return(mesos.Status_DRIVER_RUNNING, nil).Run(launchTasksCalledFunc)
@@ -469,30 +482,30 @@ func TestPlugin_LifeCycle(t *testing.T) {
//TODO(jdef) refactor things above here into a test suite setup of some sort
// fake new, unscheduled pod
pod1 := NewTestPod(1)
podListWatch.Add(pod1, true) // notify watchers
pod, i := NewTestPod()
podListWatch.Add(pod, true) // notify watchers
// wait for failedScheduling event because there is no offer
assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")
// add some matching offer
offers1 := []*mesos.Offer{NewTestOffer(1)}
testScheduler.ResourceOffers(nil, offers1)
offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
testScheduler.ResourceOffers(nil, offers)
// and wait for scheduled pod
assert.EventWithReason(eventObserver, "scheduled")
select {
case launchedTask := <-launchedTasks:
// report back that the task has been staged, and then started by mesos
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING))
// check that ExecutorInfo.data has the static pod data
assert.Len(launchedTask.Executor.Data, 3)
assert.Len(launchedTask.taskInfo.Executor.Data, 3)
// report back that the task has been lost
mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 0)
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_LOST))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_LOST))
// and wait that framework message is sent to executor
mockDriver.AssertNumberOfCalls(t, "SendFrameworkMessage", 1)
@@ -501,20 +514,15 @@ func TestPlugin_LifeCycle(t *testing.T) {
t.Fatalf("timed out waiting for launchTasks call")
}
// start another pod
podNum := 2
startPod := func() (*api.Pod, *mesos.TaskInfo, *mesos.Offer) {
podNum = podNum + 1
// create pod
pod := NewTestPod(podNum)
podListWatch.Add(pod, true) // notify watchers
// define generic pod startup
startPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
// notify watchers of new pod
podListWatch.Add(pod, true)
// wait for failedScheduling event because there is no offer
assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")
// supply a matching offer
offers := []*mesos.Offer{NewTestOffer(podNum)}
testScheduler.ResourceOffers(mockDriver, offers)
// and wait to get scheduled
@@ -523,9 +531,15 @@ func TestPlugin_LifeCycle(t *testing.T) {
// wait for driver.launchTasks call
select {
case launchedTask := <-launchedTasks:
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_STAGING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_RUNNING))
return pod, launchedTask, offers[0]
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING))
for _, offer := range offers {
if offer.Id.GetValue() == launchedTask.offerId.GetValue() {
return pod, &launchedTask, offer
}
}
t.Fatalf("unknown offer used to start a pod")
return nil, nil, nil
case <-time.After(5 * time.Second):
t.Fatal("timed out waiting for launchTasks")
@@ -533,12 +547,19 @@ func TestPlugin_LifeCycle(t *testing.T) {
}
}
pod, launchedTask, _ := startPod()
startTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
pod, i := NewTestPod()
offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
return startPodWithOffers(pod, offers)
}
// start another pod
pod, launchedTask, _ := startTestPod()
// mock driver.KillTask, should be invoked when a pod is deleted
mockDriver.On("KillTask", mAny("*mesosproto.TaskID")).Return(mesos.Status_DRIVER_RUNNING, nil).Run(func(args mock.Arguments) {
killedTaskId := *(args.Get(0).(*mesos.TaskID))
assert.Equal(*launchedTask.TaskId, killedTaskId, "expected same TaskID as during launch")
assert.Equal(*launchedTask.taskInfo.TaskId, killedTaskId, "expected same TaskID as during launch")
})
killTaskCalled := mockDriver.Upon()
@@ -549,12 +570,31 @@ func TestPlugin_LifeCycle(t *testing.T) {
select {
case <-killTaskCalled:
// report back that the task is finished
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask, mesos.TaskState_TASK_FINISHED))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_FINISHED))
case <-time.After(5 * time.Second):
t.Fatal("timed out waiting for KillTask")
}
// start a pod with a pre-set NodeName and check that it is scheduled to the right host
pod, i = NewTestPod()
pod.Spec.NodeName = "hostname1"
offers = []*mesos.Offer{}
for j := 0; j < 3; j++ {
offer := NewTestOffer(fmt.Sprintf("offer%d_%d", i, j))
hostname := fmt.Sprintf("hostname%d", j)
offer.Hostname = &hostname
offers = append(offers, offer)
}
_, _, usedOffer := startPodWithOffers(pod, offers)
assert.Equal(offers[1].Id.GetValue(), usedOffer.Id.GetValue())
assert.Equal(pod.Spec.NodeName, *usedOffer.Hostname)
testScheduler.OfferRescinded(mockDriver, offers[0].Id)
testScheduler.OfferRescinded(mockDriver, offers[2].Id)
// start pods:
// - which are failing while binding,
// - leading to reconciliation
@@ -574,26 +614,30 @@ func TestPlugin_LifeCycle(t *testing.T) {
}
// 1. with pod deleted from the apiserver
pod, launchedTask, _ = startPod()
pod, launchedTask, _ = startTestPod()
podListWatch.Delete(pod, false) // not notifying the watchers
failPodFromExecutor(launchedTask)
failPodFromExecutor(launchedTask.taskInfo)
// 2. with pod still on the apiserver, not bound
pod, launchedTask, _ = startPod()
failPodFromExecutor(launchedTask)
pod, launchedTask, _ = startTestPod()
failPodFromExecutor(launchedTask.taskInfo)
// 3. with pod still on the apiserver, bound i.e. host!=""
pod, launchedTask, usedOffer := startPod()
pod.Spec.NodeName = *usedOffer.Hostname
pod, launchedTask, usedOffer = startTestPod()
pod.Annotations = map[string]string{
meta.BindingHostKey: *usedOffer.Hostname,
}
podListWatch.Modify(pod, false) // not notifying the watchers
failPodFromExecutor(launchedTask)
failPodFromExecutor(launchedTask.taskInfo)
// 4. with pod still on the apiserver, bound i.e. host!="", notified via ListWatch
pod, launchedTask, usedOffer = startPod()
pod.Spec.NodeName = *usedOffer.Hostname
pod, launchedTask, usedOffer = startTestPod()
pod.Annotations = map[string]string{
meta.BindingHostKey: *usedOffer.Hostname,
}
podListWatch.Modify(pod, true) // notifying the watchers
time.Sleep(time.Second / 2)
failPodFromExecutor(launchedTask)
failPodFromExecutor(launchedTask.taskInfo)
}
func TestDeleteOne_NonexistentPod(t *testing.T) {

View File

@@ -213,6 +213,11 @@ func (t *T) AcceptOffer(offer *mesos.Offer) bool {
return false
}
// if the user has specified a target host, make sure this offer is for that host
if t.Pod.Spec.NodeName != "" && offer.GetHostname() != t.Pod.Spec.NodeName {
return false
}
// check ports
if _, err := t.mapper.Generate(t, offer); err != nil {
log.V(3).Info(err)
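
To illustrate the effect of this check: a task whose pod carries a pre-set NodeName now only accepts offers from that exact host, while pods without a NodeName keep matching any host. A minimal sketch (acceptsOffer is an illustrative stand-in; the real AcceptOffer also validates CPU, memory and port resources, which are omitted here):

package main

import "fmt"

// acceptsOffer mirrors only the NodeName part of the AcceptOffer change:
// a pod with a pre-set NodeName matches offers from that host alone.
func acceptsOffer(podNodeName, offerHostname string) bool {
	if podNodeName != "" && offerHostname != podNodeName {
		return false
	}
	return true // resource and port checks omitted in this sketch
}

func main() {
	fmt.Println(acceptsOffer("hostname1", "hostname0")) // false: wrong host
	fmt.Println(acceptsOffer("hostname1", "hostname1")) // true: pinned host matches
	fmt.Println(acceptsOffer("", "hostname2"))          // true: unpinned pod takes any host
}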