Show the details of preemption failures

Since the filter statuses are discarded during the preemption phase, there
is no way to tell why preemption failed, and those reasons can differ from
the status reported by the main scheduling cycle (where the first failed
plugin hides any later failures in the chain).

This change surfaces verbose information based on the node statuses generated
during pod preemption; that information helps diagnose issues that occur
while preempting pods.

Signed-off-by: Dave Chen <dave.chen@arm.com>
Dave Chen
2020-12-10 15:40:14 +08:00
parent ab8dda3c88
commit 7315c1f6dd
9 changed files with 122 additions and 73 deletions
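
To make the intent concrete: with per-node statuses available, preemption failures can be summarized into a single message like the ones the updated tests assert below. The following is a minimal, self-contained Go sketch of that aggregation; the types and helper names are local stand-ins, not the scheduler's actual FitError implementation.

// A minimal, self-contained sketch (not the scheduler's actual FitError
// implementation) of how per-node failure reasons can be folded into one
// aggregated message of the shape asserted in the tests below, e.g.
// "0/2 nodes are available: 1 Insufficient cpu, 1 No victims found ...".
// The map mirrors the idea of framework.NodeToStatusMap, but the types and
// names here are local stand-ins.
package main

import (
	"fmt"
	"sort"
	"strings"
)

// nodeToReason maps a node name to the reason it was rejected.
type nodeToReason map[string]string

// summarize counts identical reasons across nodes and renders one line.
func summarize(numAllNodes int, statuses nodeToReason) string {
	counts := map[string]int{}
	for _, reason := range statuses {
		counts[reason]++
	}
	parts := make([]string, 0, len(counts))
	for reason, n := range counts {
		parts = append(parts, fmt.Sprintf("%d %s", n, reason))
	}
	sort.Strings(parts) // deterministic ordering of the reason groups
	return fmt.Sprintf("0/%d nodes are available: %s.", numAllNodes, strings.Join(parts, ", "))
}

func main() {
	statuses := nodeToReason{
		"node1": "No victims found on node node1 for preemptor pod p",
		"node2": "Insufficient cpu",
	}
	fmt.Println(summarize(len(statuses), statuses))
}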


@@ -22,6 +22,7 @@ import (
"math"
"math/rand"
"sort"
"sync"
"sync/atomic"
"k8s.io/klog/v2"
@@ -93,8 +94,12 @@ func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.Cy
nnn, err := pl.preempt(ctx, state, pod, m)
if err != nil {
if _, ok := err.(*framework.FitError); ok {
return nil, framework.NewStatus(framework.Unschedulable, err.Error())
}
return nil, framework.AsStatus(err)
}
// This happens when the pod is not eligible for preemption or extenders filtered all candidates.
if nnn == "" {
return nil, framework.NewStatus(framework.Unschedulable)
}
@@ -214,8 +219,16 @@ func (pl *DefaultPreemption) FindCandidates(ctx context.Context, state *framewor
}
klog.Infof("from a pool of %d nodes (offset: %d, sample %d nodes: %v), ~%d candidates will be chosen", len(potentialNodes), offset, len(sample), sample, numCandidates)
}
return dryRunPreemption(ctx, pl.fh, state, pod, potentialNodes, pdbs, offset, numCandidates), nil
candidates, nodeStatuses := dryRunPreemption(ctx, pl.fh, state, pod, potentialNodes, pdbs, offset, numCandidates)
// Return a FitError only when there are no candidates that fit the pod.
if len(candidates) == 0 {
return candidates, &framework.FitError{
Pod: pod,
NumAllNodes: len(potentialNodes),
FilteredNodesStatuses: nodeStatuses,
}
}
return candidates, nil
}
// PodEligibleToPreemptOthers determines whether this pod should be considered
@@ -301,21 +314,22 @@ func (cl *candidateList) get() []Candidate {
}
// dryRunPreemption simulates Preemption logic on <potentialNodes> in parallel,
// and returns preemption candidates. The number of candidates depends on the
// constraints defined in the plugin's args. In the returned list of
// returns preemption candidates and a map indicating filtered nodes statuses.
// The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
// candidates, ones that do not violate PDB are preferred over ones that do.
func dryRunPreemption(ctx context.Context, fh framework.Handle,
state *framework.CycleState, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
pdbs []*policy.PodDisruptionBudget, offset int32, numCandidates int32) []Candidate {
pdbs []*policy.PodDisruptionBudget, offset int32, numCandidates int32) ([]Candidate, framework.NodeToStatusMap) {
nonViolatingCandidates := newCandidateList(numCandidates)
violatingCandidates := newCandidateList(numCandidates)
parallelCtx, cancel := context.WithCancel(ctx)
nodeStatuses := make(framework.NodeToStatusMap)
var statusesLock sync.Mutex
checkNode := func(i int) {
nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Clone()
stateCopy := state.Clone()
pods, numPDBViolations, fits := selectVictimsOnNode(ctx, fh, stateCopy, pod, nodeInfoCopy, pdbs)
if fits {
pods, numPDBViolations, status := selectVictimsOnNode(ctx, fh, stateCopy, pod, nodeInfoCopy, pdbs)
if status.IsSuccess() {
victims := extenderv1.Victims{
Pods: pods,
NumPDBViolations: int64(numPDBViolations),
@@ -333,10 +347,14 @@ func dryRunPreemption(ctx context.Context, fh framework.Handle,
if nvcSize > 0 && nvcSize+vcSize >= numCandidates {
cancel()
}
} else {
statusesLock.Lock()
nodeStatuses[nodeInfoCopy.Node().Name] = status
statusesLock.Unlock()
}
}
parallelize.Until(parallelCtx, len(potentialNodes), checkNode)
return append(nonViolatingCandidates.get(), violatingCandidates.get()...)
return append(nonViolatingCandidates.get(), violatingCandidates.get()...), nodeStatuses
}
// CallExtenders calls given <extenders> to select the list of feasible candidates.
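
dryRunPreemption now records a failure status per node while evaluating nodes in parallel; the shared map is the only state the workers write concurrently, so a mutex around those writes is sufficient. Below is a minimal stand-alone sketch of that pattern, with a plain sync.WaitGroup standing in for the scheduler's parallelize.Until helper and invented node names and reasons.

// A minimal stand-alone sketch of the pattern used by dryRunPreemption above:
// workers run in parallel (a plain sync.WaitGroup here stands in for the
// scheduler's parallelize.Until helper), and only writes to the shared status
// map are serialized with a mutex. Node names and reasons are invented.
package main

import (
	"fmt"
	"sync"
)

func main() {
	nodes := []string{"node1", "node2", "node3"}
	statuses := make(map[string]string) // node name -> failure reason
	var statusesLock sync.Mutex

	var wg sync.WaitGroup
	for _, node := range nodes {
		wg.Add(1)
		go func(node string) {
			defer wg.Done()
			// Pretend the dry run failed on every node and record why.
			reason := fmt.Sprintf("No victims found on node %s for preemptor pod p", node)
			statusesLock.Lock()
			statuses[node] = reason
			statusesLock.Unlock()
		}(node)
	}
	wg.Wait()

	for node, reason := range statuses {
		fmt.Printf("%s: %s\n", node, reason)
	}
}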
@@ -578,9 +596,8 @@ func selectVictimsOnNode(
pod *v1.Pod,
nodeInfo *framework.NodeInfo,
pdbs []*policy.PodDisruptionBudget,
) ([]*v1.Pod, int, bool) {
) ([]*v1.Pod, int, *framework.Status) {
var potentialVictims []*v1.Pod
ph := fh.PreemptHandle()
removePod := func(rp *v1.Pod) error {
if err := nodeInfo.RemovePod(rp); err != nil {
@@ -607,14 +624,15 @@ func selectVictimsOnNode(
if corev1helpers.PodPriority(p.Pod) < podPriority {
potentialVictims = append(potentialVictims, p.Pod)
if err := removePod(p.Pod); err != nil {
return nil, 0, false
return nil, 0, framework.NewStatus(framework.Error, err.Error())
}
}
}
// No potential victims are found, and so we don't need to evaluate the node again since its state didn't change.
if len(potentialVictims) == 0 {
return nil, 0, false
message := fmt.Sprintf("No victims found on node %v for preemptor pod %v", nodeInfo.Node().Name, pod.Name)
return nil, 0, framework.NewStatus(framework.UnschedulableAndUnresolvable, message)
}
// If the new pod does not fit after removing all the lower priority pods,
@@ -624,11 +642,7 @@ func selectVictimsOnNode(
// support this case for performance reasons. Having affinity to lower
// priority pods is not a recommended configuration anyway.
if status := fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo); !status.IsSuccess() {
if status.Code() == framework.Error {
klog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, status.AsError())
}
return nil, 0, false
return nil, 0, status
}
var victims []*v1.Pod
numViolatingVictim := 0
@@ -654,8 +668,7 @@ func selectVictimsOnNode(
}
for _, p := range violatingVictims {
if fits, err := reprievePod(p); err != nil {
klog.Warningf("Failed to reprieve pod %q: %v", p.Name, err)
return nil, 0, false
return nil, 0, framework.NewStatus(framework.Error, err.Error())
} else if !fits {
numViolatingVictim++
}
@@ -663,11 +676,10 @@ func selectVictimsOnNode(
// Now we try to reprieve non-violating victims.
for _, p := range nonViolatingVictims {
if _, err := reprievePod(p); err != nil {
klog.Warningf("Failed to reprieve pod %q: %v", p.Name, err)
return nil, 0, false
return nil, 0, framework.NewStatus(framework.Error, err.Error())
}
}
return victims, numViolatingVictim, true
return victims, numViolatingVictim, framework.NewStatus(framework.Success)
}
// PrepareCandidate does some preparation work before nominating the selected candidate:
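
selectVictimsOnNode now returns a *framework.Status instead of a bare bool, so callers can tell "the node is a candidate" apart from "skip it, and here is why" and from an internal error. The toy sketch below mimics that three-way contract with a local stand-in status type (it is not framework.Status) to show how a caller might branch on it.

// A toy illustration of the new three-way contract of selectVictimsOnNode:
// Success (the node is a candidate), UnschedulableAndUnresolvable (skip the
// node, keep the reason), or Error (internal failure). The status type below
// is a local stand-in, not framework.Status; node names are made up.
package main

import "fmt"

type code int

const (
	success code = iota
	unschedulableAndUnresolvable
	internalError
)

type status struct {
	code   code
	reason string
}

func (s *status) isSuccess() bool { return s == nil || s.code == success }

// selectVictims is a toy stand-in: "node2" yields a victim, "bad-node"
// triggers an internal error, and any other node has no lower-priority pods.
func selectVictims(node string) ([]string, *status) {
	switch node {
	case "node2":
		return []string{"victim-0"}, &status{code: success}
	case "bad-node":
		return nil, &status{internalError, "failed to remove pod from node snapshot"}
	default:
		return nil, &status{unschedulableAndUnresolvable,
			fmt.Sprintf("No victims found on node %s for preemptor pod p", node)}
	}
}

func main() {
	nodeStatuses := map[string]*status{} // mirrors the NodeToStatusMap idea
	for _, node := range []string{"node1", "node2", "bad-node"} {
		victims, st := selectVictims(node)
		if st.isSuccess() {
			fmt.Printf("%s is a candidate, victims: %v\n", node, victims)
			continue
		}
		nodeStatuses[node] = st // keep the reason instead of silently dropping the node
	}
	for node, st := range nodeStatuses {
		fmt.Printf("%s skipped: %s\n", node, st.reason)
	}
}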


@@ -100,6 +100,7 @@ func getDefaultDefaultPreemptionArgs() *config.DefaultPreemptionArgs {
func TestPostFilter(t *testing.T) {
onePodRes := map[v1.ResourceName]string{v1.ResourcePods: "1"}
nodeRes := map[v1.ResourceName]string{v1.ResourceCPU: "200m", v1.ResourceMemory: "400"}
tests := []struct {
name string
pod *v1.Pod
@@ -138,7 +139,7 @@ func TestPostFilter(t *testing.T) {
"node1": framework.NewStatus(framework.Unschedulable),
},
wantResult: nil,
wantStatus: framework.NewStatus(framework.Unschedulable),
wantStatus: framework.NewStatus(framework.Unschedulable, "0/1 nodes are available: 1 No victims found on node node1 for preemptor pod p."),
},
{
name: "preemption should respect filteredNodesStatuses",
@@ -194,6 +195,42 @@ func TestPostFilter(t *testing.T) {
},
wantStatus: framework.NewStatus(framework.Success),
},
{
name: "no candidate nodes found, no enough resource after removing low priority pods",
pod: st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).Priority(highPriority).Req(largeRes).Obj(),
pods: []*v1.Pod{
st.MakePod().Name("p1").UID("p1").Namespace(v1.NamespaceDefault).Node("node1").Obj(),
st.MakePod().Name("p2").UID("p2").Namespace(v1.NamespaceDefault).Node("node2").Obj(),
},
nodes: []*v1.Node{
st.MakeNode().Name("node1").Capacity(nodeRes).Obj(), // no enough CPU resource
st.MakeNode().Name("node2").Capacity(nodeRes).Obj(), // no enough CPU resource
},
filteredNodesStatuses: framework.NodeToStatusMap{
"node1": framework.NewStatus(framework.Unschedulable),
"node2": framework.NewStatus(framework.Unschedulable),
},
wantResult: nil,
wantStatus: framework.NewStatus(framework.Unschedulable, "0/2 nodes are available: 2 Insufficient cpu."),
},
{
name: "no candidate nodes found with mixed reasons, no lower priority pod and no enough CPU resource",
pod: st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).Priority(highPriority).Req(largeRes).Obj(),
pods: []*v1.Pod{
st.MakePod().Name("p1").UID("p1").Namespace(v1.NamespaceDefault).Node("node1").Priority(highPriority).Obj(),
st.MakePod().Name("p2").UID("p2").Namespace(v1.NamespaceDefault).Node("node2").Obj(),
},
nodes: []*v1.Node{
st.MakeNode().Name("node1").Capacity(onePodRes).Obj(), // no pod will be preempted
st.MakeNode().Name("node2").Capacity(nodeRes).Obj(), // no enough CPU resource
},
filteredNodesStatuses: framework.NodeToStatusMap{
"node1": framework.NewStatus(framework.Unschedulable),
"node2": framework.NewStatus(framework.Unschedulable),
},
wantResult: nil,
wantStatus: framework.NewStatus(framework.Unschedulable, "0/2 nodes are available: 1 Insufficient cpu, 1 No victims found on node node1 for preemptor pod p."),
},
}
for _, tt := range tests {
@@ -978,7 +1015,7 @@ func TestDryRunPreemption(t *testing.T) {
t.Errorf("cycle %d: Unexpected PreFilter Status: %v", cycle, status)
}
offset, numCandidates := pl.getOffsetAndNumCandidates(int32(len(nodeInfos)))
got := dryRunPreemption(context.Background(), fwk, state, pod, nodeInfos, tt.pdbs, offset, numCandidates)
got, _ := dryRunPreemption(context.Background(), fwk, state, pod, nodeInfos, tt.pdbs, offset, numCandidates)
if err != nil {
t.Fatal(err)
}
@@ -1201,7 +1238,7 @@ func TestSelectBestCandidate(t *testing.T) {
pl := &DefaultPreemption{args: *getDefaultDefaultPreemptionArgs()}
offset, numCandidates := pl.getOffsetAndNumCandidates(int32(len(nodeInfos)))
candidates := dryRunPreemption(context.Background(), fwk, state, tt.pod, nodeInfos, nil, offset, numCandidates)
candidates, _ := dryRunPreemption(context.Background(), fwk, state, tt.pod, nodeInfos, nil, offset, numCandidates)
s := SelectCandidate(candidates)
found := false
for _, nodeName := range tt.expected {