Merge pull request #50949 from bsalamat/preemption_eviction
Automatic merge from submit-queue
Add pod preemption to the scheduler
**What this PR does / why we need it**:
This is the last in a series of PRs that add priority-based preemption to the scheduler. This PR connects the preemption logic to the scheduler workflow.
**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes #48646
**Special notes for your reviewer**:
This PR includes other PRs which are under review (#50805, #50405, #50190). All the new code is located in 43627afdf9.
**Release note**:
```release-note
Add priority-based preemption to the scheduler.
```
ref #47604
/assign @davidopp
@kubernetes/sig-scheduling-pr-reviews
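
For readers who want the gist before reading the diff, here is a minimal, self-contained sketch of the idea the new tests exercise: when a pending high-priority pod does not fit, the scheduler chooses lower-priority victims whose eviction frees enough room. This is illustrative only; the `pod` struct, the `pickVictims` helper, and the CPU-only accounting are assumptions made for the example and are not the real `plugin/pkg/scheduler` API.

```go
package main

import (
    "fmt"
    "sort"
)

// pod is a toy stand-in for a scheduled pod (not the real v1.Pod type).
type pod struct {
    name     string
    priority int32
    milliCPU int64
}

// pickVictims returns the lowest-priority pods (all with priority below the
// preemptor's) whose eviction frees at least `needed` milliCPU, or nil if
// preemption cannot free enough resources.
func pickVictims(running []pod, preemptorPriority int32, needed int64) []pod {
    candidates := make([]pod, 0, len(running))
    for _, p := range running {
        if p.priority < preemptorPriority {
            candidates = append(candidates, p)
        }
    }
    // Evict lower-priority pods first.
    sort.Slice(candidates, func(i, j int) bool {
        return candidates[i].priority < candidates[j].priority
    })
    victims := []pod{}
    freed := int64(0)
    for _, p := range candidates {
        if freed >= needed {
            break
        }
        victims = append(victims, p)
        freed += p.milliCPU
    }
    if freed < needed {
        return nil
    }
    return victims
}

func main() {
    // A full node: a low-priority and a medium-priority pod are running.
    running := []pod{
        {name: "low-pod", priority: 1, milliCPU: 400},
        {name: "medium-pod", priority: 100, milliCPU: 400},
    }
    // A high-priority pod (priority 1000) needs 300m CPU.
    victims := pickVictims(running, 1000, 300)
    fmt.Println("victims:", victims) // expect only the low-priority pod
}
```

The actual behavior is verified end to end by the tests in this diff: victims get a deletion timestamp, and the preemptor pod is annotated with the nominated node name (`core.NominatedNodeAnnotationKey`).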
@@ -15,6 +15,7 @@ go_library(
        "nvidia-gpus.go",
        "opaque_resource.go",
        "predicates.go",
        "preemption.go",
        "priorities.go",
        "rescheduler.go",
    ],
@@ -33,6 +34,7 @@ go_library(
        "//vendor/github.com/stretchr/testify/assert:go_default_library",
        "//vendor/k8s.io/api/core/v1:go_default_library",
        "//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
        "//vendor/k8s.io/api/scheduling/v1alpha1:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
@@ -52,6 +52,7 @@ type pausePodConfig struct {
    NodeName string
    Ports []v1.ContainerPort
    OwnerReferences []metav1.OwnerReference
    PriorityClassName string
}

var _ = SIGDescribe("SchedulerPredicates [Serial]", func() {
@@ -555,8 +556,9 @@ func initPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
                    Ports: conf.Ports,
                },
            },
            Tolerations: conf.Tolerations,
            NodeName: conf.NodeName,
            Tolerations: conf.Tolerations,
            NodeName: conf.NodeName,
            PriorityClassName: conf.PriorityClassName,
        },
    }
    if conf.Resources != nil {
test/e2e/scheduling/preemption.go (new file, 128 lines)
@@ -0,0 +1,128 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduling

import (
    "fmt"
    "time"

    "k8s.io/api/core/v1"
    "k8s.io/api/scheduling/v1alpha1"
    "k8s.io/apimachinery/pkg/api/errors"
    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    clientset "k8s.io/client-go/kubernetes"
    "k8s.io/kubernetes/test/e2e/framework"

    . "github.com/onsi/ginkgo"
    . "github.com/onsi/gomega"
    _ "github.com/stretchr/testify/assert"
)

var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func() {
    var cs clientset.Interface
    var nodeList *v1.NodeList
    var ns string
    f := framework.NewDefaultFramework("sched-preemption")

    lowPriority, mediumPriority, highPriority := int32(1), int32(100), int32(1000)
    lowPriorityClassName := f.BaseName + "-low-priority"
    mediumPriorityClassName := f.BaseName + "-medium-priority"
    highPriorityClassName := f.BaseName + "-high-priority"

    AfterEach(func() {
    })

    BeforeEach(func() {
        cs = f.ClientSet
        ns = f.Namespace.Name
        nodeList = &v1.NodeList{}

        _, err := f.ClientSet.SchedulingV1alpha1().PriorityClasses().Create(&v1alpha1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: highPriorityClassName}, Value: highPriority})
        Expect(err == nil || errors.IsAlreadyExists(err)).To(Equal(true))
        _, err = f.ClientSet.SchedulingV1alpha1().PriorityClasses().Create(&v1alpha1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: mediumPriorityClassName}, Value: mediumPriority})
        Expect(err == nil || errors.IsAlreadyExists(err)).To(Equal(true))
        _, err = f.ClientSet.SchedulingV1alpha1().PriorityClasses().Create(&v1alpha1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: lowPriorityClassName}, Value: lowPriority})
        Expect(err == nil || errors.IsAlreadyExists(err)).To(Equal(true))

        framework.WaitForAllNodesHealthy(cs, time.Minute)
        masterNodes, nodeList = framework.GetMasterAndWorkerNodesOrDie(cs)

        err = framework.CheckTestingNSDeletedExcept(cs, ns)
        framework.ExpectNoError(err)
    })

    // This test verifies that when a higher priority pod is created and no node with
    // enough resources is found, scheduler preempts a lower priority pod to schedule
    // the high priority pod.
    It("validates basic preemption works", func() {
        var podRes v1.ResourceList
        // Create one pod per node that uses a lot of the node's resources.
        By("Create pods that use 60% of node resources.")
        pods := make([]*v1.Pod, len(nodeList.Items))
        for i, node := range nodeList.Items {
            cpuAllocatable, found := node.Status.Allocatable["cpu"]
            Expect(found).To(Equal(true))
            milliCPU := cpuAllocatable.MilliValue() * 40 / 100
            memAllocatable, found := node.Status.Allocatable["memory"]
            Expect(found).To(Equal(true))
            memory := memAllocatable.Value() * 60 / 100
            podRes = v1.ResourceList{}
            podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI)
            podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI)

            // make the first pod low priority and the rest medium priority.
            priorityName := mediumPriorityClassName
            if i == 0 {
                priorityName = lowPriorityClassName
            }
            pods[i] = createPausePod(f, pausePodConfig{
                Name: fmt.Sprintf("pod%d-%v", i, priorityName),
                PriorityClassName: priorityName,
                Resources: &v1.ResourceRequirements{
                    Requests: podRes,
                },
            })
            framework.Logf("Created pod: %v", pods[i].Name)
        }
        By("Wait for pods to be scheduled.")
        for _, pod := range pods {
            framework.ExpectNoError(framework.WaitForPodRunningInNamespace(cs, pod))
        }

        By("Run a high priority pod that use 60% of a node resources.")
        // Create a high priority pod and make sure it is scheduled.
        runPausePod(f, pausePodConfig{
            Name: "preemptor-pod",
            PriorityClassName: highPriorityClassName,
            Resources: &v1.ResourceRequirements{
                Requests: podRes,
            },
        })
        // Make sure that the lowest priority pod is deleted.
        preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(pods[0].Name, metav1.GetOptions{})
        podDeleted := (err != nil && errors.IsNotFound(err)) ||
            (err == nil && preemptedPod.DeletionTimestamp != nil)
        Expect(podDeleted).To(BeTrue())
        // Other pods (mid priority ones) should be present.
        for i := 1; i < len(pods); i++ {
            livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(pods[i].Name, metav1.GetOptions{})
            framework.ExpectNoError(err)
            Expect(livePod.DeletionTimestamp).To(BeNil())
        }
    })
})
@@ -21,12 +21,14 @@ go_test(
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/testapi:go_default_library",
        "//pkg/features:go_default_library",
        "//plugin/cmd/kube-scheduler/app:go_default_library",
        "//plugin/cmd/kube-scheduler/app/options:go_default_library",
        "//plugin/pkg/scheduler:go_default_library",
        "//plugin/pkg/scheduler/algorithm:go_default_library",
        "//plugin/pkg/scheduler/algorithmprovider:go_default_library",
        "//plugin/pkg/scheduler/api:go_default_library",
        "//plugin/pkg/scheduler/core:go_default_library",
        "//plugin/pkg/scheduler/factory:go_default_library",
        "//plugin/pkg/scheduler/schedulercache:go_default_library",
        "//test/e2e/framework:go_default_library",
@@ -37,6 +39,7 @@ go_test(
        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
        "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
        "//vendor/k8s.io/client-go/informers:go_default_library",
        "//vendor/k8s.io/client-go/kubernetes:go_default_library",
        "//vendor/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
@@ -51,7 +51,7 @@ func TestNodeAffinity(t *testing.T) {
    }
    // Create a pod with node affinity.
    podName := "pod-with-node-affinity"
    pod, err := runPausePod(context.clientSet, &pausePodConfig{
    pod, err := runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{
        Name: podName,
        Namespace: context.ns.Name,
        Affinity: &v1.Affinity{
@@ -72,7 +72,7 @@ func TestNodeAffinity(t *testing.T) {
                },
            },
        },
    })
    }))
    if err != nil {
        t.Fatalf("Error running pause pod: %v", err)
    }
@@ -110,11 +110,11 @@ func TestPodAffinity(t *testing.T) {
    // Add a pod with a label and wait for it to schedule.
    labelKey := "service"
    labelValue := "S1"
    _, err = runPausePod(context.clientSet, &pausePodConfig{
    _, err = runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{
        Name: "attractor-pod",
        Namespace: context.ns.Name,
        Labels: map[string]string{labelKey: labelValue},
    })
    }))
    if err != nil {
        t.Fatalf("Error running the attractor pod: %v", err)
    }
@@ -125,7 +125,7 @@ func TestPodAffinity(t *testing.T) {
    }
    // Add a new pod with affinity to the attractor pod.
    podName := "pod-with-podaffinity"
    pod, err := runPausePod(context.clientSet, &pausePodConfig{
    pod, err := runPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{
        Name: podName,
        Namespace: context.ns.Name,
        Affinity: &v1.Affinity{
@@ -158,7 +158,7 @@ func TestPodAffinity(t *testing.T) {
                },
            },
        },
    })
    }))
    if err != nil {
        t.Fatalf("Error running pause pod: %v", err)
    }
@@ -24,9 +24,11 @@ import (
    "time"

    "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/errors"
    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    "k8s.io/client-go/informers"
    clientset "k8s.io/client-go/kubernetes"
    clientv1core "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -36,15 +38,18 @@ import (
    "k8s.io/client-go/tools/record"
    "k8s.io/kubernetes/pkg/api"
    "k8s.io/kubernetes/pkg/api/testapi"
    "k8s.io/kubernetes/pkg/features"
    "k8s.io/kubernetes/plugin/cmd/kube-scheduler/app"
    "k8s.io/kubernetes/plugin/cmd/kube-scheduler/app/options"
    "k8s.io/kubernetes/plugin/pkg/scheduler"
    "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
    _ "k8s.io/kubernetes/plugin/pkg/scheduler/algorithmprovider"
    schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
    "k8s.io/kubernetes/plugin/pkg/scheduler/core"
    "k8s.io/kubernetes/plugin/pkg/scheduler/factory"
    "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
    "k8s.io/kubernetes/test/integration/framework"
    testutils "k8s.io/kubernetes/test/utils"
)

const enableEquivalenceCache = true
@@ -56,11 +61,11 @@ type nodeStateManager struct {
    makeUnSchedulable nodeMutationFunc
}

func PredicateOne(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
func PredicateOne(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
    return true, nil, nil
}

func PredicateTwo(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
func PredicateTwo(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
    return true, nil, nil
}
@@ -457,13 +462,13 @@ func TestMultiScheduler(t *testing.T) {
    }

    defaultScheduler := "default-scheduler"
    testPodFitsDefault, err := createPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-default", Namespace: context.ns.Name, SchedulerName: defaultScheduler})
    testPodFitsDefault, err := createPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-default", Namespace: context.ns.Name, SchedulerName: defaultScheduler}))
    if err != nil {
        t.Fatalf("Failed to create pod: %v", err)
    }

    fooScheduler := "foo-scheduler"
    testPodFitsFoo, err := createPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-foo", Namespace: context.ns.Name, SchedulerName: fooScheduler})
    testPodFitsFoo, err := createPausePod(context.clientSet, initPausePod(context.clientSet, &pausePodConfig{Name: "pod-fits-foo", Namespace: context.ns.Name, SchedulerName: fooScheduler}))
    if err != nil {
        t.Fatalf("Failed to create pod: %v", err)
    }
@@ -647,3 +652,251 @@ func TestAllocatable(t *testing.T) {
        t.Logf("Test allocatable awareness: %s Pod not scheduled as expected", testAllocPod2.Name)
    }
}

// TestPreemption tests a few preemption scenarios.
func TestPreemption(t *testing.T) {
    // Enable PodPriority feature gate.
    utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.PodPriority))
    // Initialize scheduler.
    context := initTest(t, "preemption")
    defer cleanupTest(t, context)
    cs := context.clientSet

    lowPriority, mediumPriority, highPriority := int32(100), int32(200), int32(300)
    defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
        v1.ResourceCPU: *resource.NewMilliQuantity(100, resource.DecimalSI),
        v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI)},
    }

    tests := []struct {
        description string
        existingPods []*v1.Pod
        pod *v1.Pod
        preemptedPodIndexes map[int]struct{}
    }{
        {
            description: "basic pod preemption",
            existingPods: []*v1.Pod{
                initPausePod(context.clientSet, &pausePodConfig{
                    Name: "victim-pod",
                    Namespace: context.ns.Name,
                    Priority: &lowPriority,
                    Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
                        v1.ResourceCPU: *resource.NewMilliQuantity(400, resource.DecimalSI),
                        v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
                    },
                }),
            },
            pod: initPausePod(cs, &pausePodConfig{
                Name: "preemptor-pod",
                Namespace: context.ns.Name,
                Priority: &highPriority,
                Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
                    v1.ResourceCPU: *resource.NewMilliQuantity(300, resource.DecimalSI),
                    v1.ResourceMemory: *resource.NewQuantity(200, resource.BinarySI)},
                },
            }),
            preemptedPodIndexes: map[int]struct{}{0: {}},
        },
        {
            description: "preemption is performed to satisfy anti-affinity",
            existingPods: []*v1.Pod{
                initPausePod(cs, &pausePodConfig{
                    Name: "pod-0", Namespace: context.ns.Name,
                    Priority: &mediumPriority,
                    Labels: map[string]string{"pod": "p0"},
                    Resources: defaultPodRes,
                }),
                initPausePod(cs, &pausePodConfig{
                    Name: "pod-1", Namespace: context.ns.Name,
                    Priority: &lowPriority,
                    Labels: map[string]string{"pod": "p1"},
                    Resources: defaultPodRes,
                    Affinity: &v1.Affinity{
                        PodAntiAffinity: &v1.PodAntiAffinity{
                            RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
                                {
                                    LabelSelector: &metav1.LabelSelector{
                                        MatchExpressions: []metav1.LabelSelectorRequirement{
                                            {
                                                Key: "pod",
                                                Operator: metav1.LabelSelectorOpIn,
                                                Values: []string{"preemptor"},
                                            },
                                        },
                                    },
                                    TopologyKey: "node",
                                },
                            },
                        },
                    },
                }),
            },
            // A higher priority pod with anti-affinity.
            pod: initPausePod(cs, &pausePodConfig{
                Name: "preemptor-pod",
                Namespace: context.ns.Name,
                Priority: &highPriority,
                Labels: map[string]string{"pod": "preemptor"},
                Resources: defaultPodRes,
                Affinity: &v1.Affinity{
                    PodAntiAffinity: &v1.PodAntiAffinity{
                        RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
                            {
                                LabelSelector: &metav1.LabelSelector{
                                    MatchExpressions: []metav1.LabelSelectorRequirement{
                                        {
                                            Key: "pod",
                                            Operator: metav1.LabelSelectorOpIn,
                                            Values: []string{"p0"},
                                        },
                                    },
                                },
                                TopologyKey: "node",
                            },
                        },
                    },
                },
            }),
            preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}},
        },
        {
            // This is similar to the previous case only pod-1 is high priority.
            description: "preemption is not performed when anti-affinity is not satisfied",
            existingPods: []*v1.Pod{
                initPausePod(cs, &pausePodConfig{
                    Name: "pod-0", Namespace: context.ns.Name,
                    Priority: &mediumPriority,
                    Labels: map[string]string{"pod": "p0"},
                    Resources: defaultPodRes,
                }),
                initPausePod(cs, &pausePodConfig{
                    Name: "pod-1", Namespace: context.ns.Name,
                    Priority: &highPriority,
                    Labels: map[string]string{"pod": "p1"},
                    Resources: defaultPodRes,
                    Affinity: &v1.Affinity{
                        PodAntiAffinity: &v1.PodAntiAffinity{
                            RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
                                {
                                    LabelSelector: &metav1.LabelSelector{
                                        MatchExpressions: []metav1.LabelSelectorRequirement{
                                            {
                                                Key: "pod",
                                                Operator: metav1.LabelSelectorOpIn,
                                                Values: []string{"preemptor"},
                                            },
                                        },
                                    },
                                    TopologyKey: "node",
                                },
                            },
                        },
                    },
                }),
            },
            // A higher priority pod with anti-affinity.
            pod: initPausePod(cs, &pausePodConfig{
                Name: "preemptor-pod",
                Namespace: context.ns.Name,
                Priority: &highPriority,
                Labels: map[string]string{"pod": "preemptor"},
                Resources: defaultPodRes,
                Affinity: &v1.Affinity{
                    PodAntiAffinity: &v1.PodAntiAffinity{
                        RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
                            {
                                LabelSelector: &metav1.LabelSelector{
                                    MatchExpressions: []metav1.LabelSelectorRequirement{
                                        {
                                            Key: "pod",
                                            Operator: metav1.LabelSelectorOpIn,
                                            Values: []string{"p0"},
                                        },
                                    },
                                },
                                TopologyKey: "node",
                            },
                        },
                    },
                },
            }),
            preemptedPodIndexes: map[int]struct{}{},
        },
    }
    // Create a node with some resources and a label.
    nodeRes := &v1.ResourceList{
        v1.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI),
        v1.ResourceCPU: *resource.NewMilliQuantity(500, resource.DecimalSI),
        v1.ResourceMemory: *resource.NewQuantity(500, resource.BinarySI),
    }
    node, err := createNode(context.clientSet, "node1", nodeRes)
    if err != nil {
        t.Fatalf("Error creating nodes: %v", err)
    }
    nodeLabels := map[string]string{"node": node.Name}
    if err = testutils.AddLabelsToNode(context.clientSet, node.Name, nodeLabels); err != nil {
        t.Fatalf("Cannot add labels to node: %v", err)
    }
    if err = waitForNodeLabels(context.clientSet, node.Name, nodeLabels); err != nil {
        t.Fatalf("Adding labels to node didn't succeed: %v", err)
    }

    for _, test := range tests {
        pods := make([]*v1.Pod, len(test.existingPods))
        // Create and run existingPods.
        for i, p := range test.existingPods {
            pods[i], err = runPausePod(cs, p)
            if err != nil {
                t.Fatalf("Test [%v]: Error running pause pod: %v", test.description, err)
            }
        }
        // Create the "pod".
        preemptor, err := createPausePod(cs, test.pod)
        if err != nil {
            t.Errorf("Error while creating high priority pod: %v", err)
        }
        // Wait for preemption of pods and make sure the other ones are not preempted.
        for i, p := range pods {
            if _, found := test.preemptedPodIndexes[i]; found {
                if err = wait.Poll(time.Second, wait.ForeverTestTimeout, podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
                    t.Errorf("Test [%v]: Pod %v is not getting evicted.", test.description, p.Name)
                }
            } else {
                if p.DeletionTimestamp != nil {
                    t.Errorf("Test [%v]: Didn't expect pod %v to get preempted.", test.description, p.Name)
                }
            }
        }
        // Also check that the preemptor pod gets the annotation for nominated node name.
        if len(test.preemptedPodIndexes) > 0 {
            if err = wait.Poll(time.Second, wait.ForeverTestTimeout, func() (bool, error) {
                pod, err := context.clientSet.CoreV1().Pods(context.ns.Name).Get("preemptor-pod", metav1.GetOptions{})
                if err != nil {
                    t.Errorf("Test [%v]: error getting pod: %v", test.description, err)
                }
                annot, found := pod.Annotations[core.NominatedNodeAnnotationKey]
                if found && len(annot) > 0 {
                    return true, nil
                }
                return false, err
            }); err != nil {
                t.Errorf("Test [%v]: Pod annotation did not get set.", test.description)
            }
        }

        // Cleanup
        pods = append(pods, preemptor)
        for _, p := range pods {
            err = cs.CoreV1().Pods(p.Namespace).Delete(p.Name, metav1.NewDeleteOptions(0))
            if err != nil && !errors.IsNotFound(err) {
                t.Errorf("Test [%v]: error, %v, while deleting pod during test.", test.description, err)
            }
            err = wait.Poll(time.Second, wait.ForeverTestTimeout, podDeleted(cs, p.Namespace, p.Name))
            if err != nil {
                t.Errorf("Test [%v]: error, %v, while waiting for pod to get deleted.", test.description, err)
            }
        }
    }
}
@@ -205,6 +205,7 @@ type pausePodConfig struct {
    Tolerations []v1.Toleration
    NodeName string
    SchedulerName string
    Priority *int32
}

// initPausePod initializes a pod API object from the given config. It is used
@@ -213,6 +214,7 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod {
    pod := &v1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Name: conf.Name,
            Namespace: conf.Namespace,
            Labels: conf.Labels,
            Annotations: conf.Annotations,
        },
@@ -228,6 +230,7 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod {
            Tolerations: conf.Tolerations,
            NodeName: conf.NodeName,
            SchedulerName: conf.SchedulerName,
            Priority: conf.Priority,
        },
    }
    if conf.Resources != nil {
@@ -238,9 +241,8 @@ func initPausePod(cs clientset.Interface, conf *pausePodConfig) *v1.Pod {

// createPausePod creates a pod with "Pause" image and the given config and
// return its pointer and error status.
func createPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error) {
    p := initPausePod(cs, conf)
    return cs.CoreV1().Pods(conf.Namespace).Create(p)
func createPausePod(cs clientset.Interface, p *v1.Pod) (*v1.Pod, error) {
    return cs.CoreV1().Pods(p.Namespace).Create(p)
}

// createPausePodWithResource creates a pod with "Pause" image and the given
@@ -262,22 +264,21 @@ func createPausePodWithResource(cs clientset.Interface, podName string, nsName s
            },
        }
    }
    return createPausePod(cs, &conf)
    return createPausePod(cs, initPausePod(cs, &conf))
}
// runPausePod creates a pod with "Pause" image and the given config and waits
// until it is scheduled. It returns its pointer and error status.
func runPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error) {
    p := initPausePod(cs, conf)
    pod, err := cs.CoreV1().Pods(conf.Namespace).Create(p)
func runPausePod(cs clientset.Interface, pod *v1.Pod) (*v1.Pod, error) {
    pod, err := cs.CoreV1().Pods(pod.Namespace).Create(pod)
    if err != nil {
        return nil, fmt.Errorf("Error creating pause pod: %v", err)
    }
    if err = waitForPodToSchedule(cs, pod); err != nil {
        return pod, fmt.Errorf("Pod %v didn't schedule successfully. Error: %v", pod.Name, err)
    }
    if pod, err = cs.CoreV1().Pods(conf.Namespace).Get(conf.Name, metav1.GetOptions{}); err != nil {
        return pod, fmt.Errorf("Error getting pod %v info: %v", conf.Name, err)
    if pod, err = cs.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}); err != nil {
        return pod, fmt.Errorf("Error getting pod %v info: %v", pod.Name, err)
    }
    return pod, nil
}
@@ -285,7 +286,10 @@ func runPausePod(cs clientset.Interface, conf *pausePodConfig) (*v1.Pod, error)
// podDeleted returns true if a pod is not found in the given namespace.
func podDeleted(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
    return func() (bool, error) {
        _, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{})
        pod, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{})
        if pod.DeletionTimestamp != nil {
            return true, nil
        }
        if errors.IsNotFound(err) {
            return true, nil
        }
@@ -293,6 +297,20 @@ func podDeleted(c clientset.Interface, podNamespace, podName string) wait.Condit
    }
}

// podIsGettingEvicted returns true if the pod's deletion timestamp is set.
func podIsGettingEvicted(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
    return func() (bool, error) {
        pod, err := c.CoreV1().Pods(podNamespace).Get(podName, metav1.GetOptions{})
        if err != nil {
            return false, err
        }
        if pod.DeletionTimestamp != nil {
            return true, nil
        }
        return false, nil
    }
}

// podScheduled returns true if a node is assigned to the given pod.
func podScheduled(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
    return func() (bool, error) {