k8s.io/dynamic-resource-allocation: fix potential scheduling deadlock
When handling a PodSchedulingContext object, the code first checked for
unsuitable nodes and then tried to allocate if (and only if) the selected node
hadn't been found to be unsuitable.
If for whatever reason the selected node wasn't listed as a potential node, then
scheduling got stuck: the allocation would fail and cause a return with an error
instead of updating the list of unsuitable nodes, and every retry hit the same
result.
To avoid this scenario, the selected node now also gets checked. This is better
than assuming a certain kube-scheduler behavior.
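In sketch form, the guard amounts to the following standalone example
(ensureSelectedNodeIncluded is a name made up for illustration; the actual
change is in the controller diff below):

package main

import "fmt"

// ensureSelectedNodeIncluded is not the controller code itself, just a
// restatement of the guard: the node list that gets checked for
// unsuitability must always contain the selected node, even when the
// scheduler did not list it among the potential nodes.
func ensureSelectedNodeIncluded(potentialNodes []string, selectedNode string) []string {
	if selectedNode == "" {
		return potentialNodes
	}
	for _, node := range potentialNodes {
		if node == selectedNode {
			return potentialNodes
		}
	}
	return append(potentialNodes, selectedNode)
}

func main() {
	// The selected node is missing from the potential nodes, so it gets
	// appended before the driver's unsuitability check runs.
	fmt.Println(ensureSelectedNodeIncluded([]string{"node-a", "node-b"}, "node-c"))
	// Output: [node-a node-b node-c]
}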
This problem occurred when experimenting with cluster autoscaling:
    spec:
      potentialNodes:
      - gke-cluster-pohly-pool-dra-69b88e1e-bz6c
      - gke-cluster-pohly-pool-dra-69b88e1e-fpvh
      selectedNode: gke-cluster-pohly-default-pool-c9f60a43-6kxh
Why the scheduler wrote a spec like this is unclear. This was observed with
Kubernetes 1.27, and the code has been updated since then, so perhaps the issue
has already been resolved.
@@ -122,6 +122,12 @@ type Driver interface {
 	// can be allocated for it (for example, two GPUs requested but
 	// the node only has one).
 	//
+	// The potentialNodes slice contains all potential nodes selected
+	// by the scheduler plus the selected node. The response must
+	// not contain any other nodes. Implementations do not have to
+	// care about size limits in the PodSchedulingContext status, the
+	// caller will handle that.
+	//
 	// The result of the check is in ClaimAllocation.UnsuitableNodes.
 	// An error indicates that the entire check must be repeated.
 	UnsuitableNodes(ctx context.Context, pod *v1.Pod, claims []*ClaimAllocation, potentialNodes []string) error
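As an aside, the contract documented above could be satisfied by a resource
driver roughly like this (a hypothetical sketch, not part of this commit:
exampleDriver and its freeDevices inventory are invented, and the usual
k8s.io/dynamic-resource-allocation/controller import path is assumed; only the
method signature and the ClaimAllocation.UnsuitableNodes result field come from
the interface above):

package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"k8s.io/dynamic-resource-allocation/controller"
)

// exampleDriver is an invented driver that only knows how many free devices
// each node currently has.
type exampleDriver struct {
	freeDevices map[string]int // node name -> number of unallocated devices
}

// UnsuitableNodes checks every pending claim against every potential node
// (which, per the interface comment, already includes the selected node),
// reports only nodes taken from potentialNodes, and leaves truncation of the
// status list to the caller.
func (d *exampleDriver) UnsuitableNodes(ctx context.Context, pod *v1.Pod, claims []*controller.ClaimAllocation, potentialNodes []string) error {
	for _, claim := range claims {
		claim.UnsuitableNodes = nil
		for _, node := range potentialNodes {
			// Simplified rule for this sketch: the node must be able
			// to host one device per pending claim of the pod.
			if d.freeDevices[node] < len(claims) {
				claim.UnsuitableNodes = append(claim.UnsuitableNodes, node)
			}
		}
	}
	return nil
}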
@@ -752,12 +758,20 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
 	// and shouldn't, because those allocations might have to be undone to
 	// pick a better node. If we don't need to allocate now, then we'll
 	// simply report back the gather information.
+	//
+	// We shouldn't assume that the scheduler has included the selected node
+	// in the list of potential nodes. Usually it does, but let's make sure
+	// that we check it.
+	selectedNode := schedulingCtx.Spec.SelectedNode
+	potentialNodes := schedulingCtx.Spec.PotentialNodes
+	if selectedNode != "" && !hasString(potentialNodes, selectedNode) {
+		potentialNodes = append(potentialNodes, selectedNode)
+	}
 	if len(schedulingCtx.Spec.PotentialNodes) > 0 {
-		if err := ctrl.driver.UnsuitableNodes(ctx, pod, claims, schedulingCtx.Spec.PotentialNodes); err != nil {
+		if err := ctrl.driver.UnsuitableNodes(ctx, pod, claims, potentialNodes); err != nil {
 			return fmt.Errorf("checking potential nodes: %v", err)
 		}
 	}
-	selectedNode := schedulingCtx.Spec.SelectedNode
 	logger.V(5).Info("pending pod claims", "claims", claims, "selectedNode", selectedNode)
 	if selectedNode != "" {
 		unsuitable := false
@@ -811,12 +825,12 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
 			schedulingCtx.Status.ResourceClaims = append(schedulingCtx.Status.ResourceClaims,
 				resourcev1alpha2.ResourceClaimSchedulingStatus{
 					Name:            delayed.PodClaimName,
-					UnsuitableNodes: delayed.UnsuitableNodes,
+					UnsuitableNodes: truncateNodes(delayed.UnsuitableNodes, selectedNode),
 				})
 			modified = true
 		} else if stringsDiffer(schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes, delayed.UnsuitableNodes) {
 			// Update existing entry.
-			schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes = delayed.UnsuitableNodes
+			schedulingCtx.Status.ResourceClaims[i].UnsuitableNodes = truncateNodes(delayed.UnsuitableNodes, selectedNode)
 			modified = true
 		}
 	}
@@ -832,6 +846,23 @@ func (ctrl *controller) syncPodSchedulingContexts(ctx context.Context, schedulin
 	return errPeriodic
 }
 
+func truncateNodes(nodes []string, selectedNode string) []string {
+	// We might have checked "potential nodes + selected node" above, so
+	// this list might be too long by one element. When truncating it, make
+	// sure that the selected node is listed.
+	lenUnsuitable := len(nodes)
+	if lenUnsuitable > resourcev1alpha2.PodSchedulingNodeListMaxSize {
+		if nodes[0] == selectedNode {
+			// Truncate at the end and keep selected node in the first element.
+			nodes = nodes[0 : lenUnsuitable-1]
+		} else {
+			// Truncate at the front, it's not the selected node.
+			nodes = nodes[1:lenUnsuitable]
+		}
+	}
+	return nodes
+}
+
 type claimAllocations []*ClaimAllocation
 
 // MarshalLog replaces the pointers with the actual structs because
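To illustrate the truncation rule, here is a standalone restatement with the
maximum passed in explicitly (truncateTo and the limit of 3 are made up for the
example; the real code compares against
resourcev1alpha2.PodSchedulingNodeListMaxSize):

package main

import "fmt"

// truncateTo drops at most one element when the list is too long: from the
// end if the selected node is first, otherwise from the front, so that the
// selected node always survives truncation.
func truncateTo(nodes []string, selectedNode string, max int) []string {
	if len(nodes) > max {
		if nodes[0] == selectedNode {
			return nodes[:len(nodes)-1]
		}
		return nodes[1:]
	}
	return nodes
}

func main() {
	fmt.Println(truncateTo([]string{"sel", "a", "b", "c"}, "sel", 3)) // [sel a b]
	fmt.Println(truncateTo([]string{"a", "b", "c", "sel"}, "sel", 3)) // [b c sel]
}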
@@ -19,6 +19,7 @@ package controller
 import (
 	"context"
 	"errors"
+	"fmt"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -64,6 +65,10 @@ func TestController(t *testing.T) {
 	otherNodeName := "worker-2"
 	unsuitableNodes := []string{otherNodeName}
 	potentialNodes := []string{nodeName, otherNodeName}
+	maxNodes := make([]string, resourcev1alpha2.PodSchedulingNodeListMaxSize)
+	for i := range maxNodes {
+		maxNodes[i] = fmt.Sprintf("node-%d", i)
+	}
 	withDeletionTimestamp := func(claim *resourcev1alpha2.ResourceClaim) *resourcev1alpha2.ResourceClaim {
 		var deleted metav1.Time
 		claim = claim.DeepCopy()
@@ -101,18 +106,24 @@ func TestController(t *testing.T) {
 		podSchedulingCtx.Spec.SelectedNode = nodeName
 		return podSchedulingCtx
 	}
-	withUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
+	withSpecificUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext, unsuitableNodes []string) *resourcev1alpha2.PodSchedulingContext {
 		podSchedulingCtx = podSchedulingCtx.DeepCopy()
 		podSchedulingCtx.Status.ResourceClaims = append(podSchedulingCtx.Status.ResourceClaims,
 			resourcev1alpha2.ResourceClaimSchedulingStatus{Name: podClaimName, UnsuitableNodes: unsuitableNodes},
 		)
 		return podSchedulingCtx
 	}
-	withPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
+	withUnsuitableNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
+		return withSpecificUnsuitableNodes(podSchedulingCtx, unsuitableNodes)
+	}
+	withSpecificPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext, potentialNodes []string) *resourcev1alpha2.PodSchedulingContext {
 		podSchedulingCtx = podSchedulingCtx.DeepCopy()
 		podSchedulingCtx.Spec.PotentialNodes = potentialNodes
 		return podSchedulingCtx
 	}
+	withPotentialNodes := func(podSchedulingCtx *resourcev1alpha2.PodSchedulingContext) *resourcev1alpha2.PodSchedulingContext {
+		return withSpecificPotentialNodes(podSchedulingCtx, potentialNodes)
+	}
 
 	var m mockDriver
@@ -376,6 +387,48 @@ func TestController(t *testing.T) {
 			expectedSchedulingCtx: withUnsuitableNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx))),
 			expectedError:         errPeriodic.Error(),
 		},
+		// pod with delayed allocation, potential nodes, selected node, all unsuitable -> update unsuitable nodes
+		"pod-selected-is-potential-node": {
+			key:           podKey,
+			classes:       classes,
+			claim:         delayedClaim,
+			expectedClaim: delayedClaim,
+			pod:           podWithClaim,
+			schedulingCtx: withPotentialNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx))),
+			driver: m.expectClassParameters(map[string]interface{}{className: 1}).
+				expectClaimParameters(map[string]interface{}{claimName: 2}).
+				expectUnsuitableNodes(map[string][]string{podClaimName: potentialNodes}, nil),
+			expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withPotentialNodes(podSchedulingCtx)), potentialNodes),
+			expectedError:         errPeriodic.Error(),
+		},
+		// pod with delayed allocation, max potential nodes, other selected node, all unsuitable -> update unsuitable nodes with truncation at start
+		"pod-selected-is-potential-node-truncate-first": {
+			key:           podKey,
+			classes:       classes,
+			claim:         delayedClaim,
+			expectedClaim: delayedClaim,
+			pod:           podWithClaim,
+			schedulingCtx: withSpecificPotentialNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), maxNodes),
+			driver: m.expectClassParameters(map[string]interface{}{className: 1}).
+				expectClaimParameters(map[string]interface{}{claimName: 2}).
+				expectUnsuitableNodes(map[string][]string{podClaimName: append(maxNodes, nodeName)}, nil),
+			expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), append(maxNodes[1:], nodeName)),
+			expectedError:         errPeriodic.Error(),
+		},
+		// pod with delayed allocation, max potential nodes, other selected node, all unsuitable (but in reverse order) -> update unsuitable nodes with truncation at end
+		"pod-selected-is-potential-node-truncate-last": {
+			key:           podKey,
+			classes:       classes,
+			claim:         delayedClaim,
+			expectedClaim: delayedClaim,
+			pod:           podWithClaim,
+			schedulingCtx: withSpecificPotentialNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), maxNodes),
+			driver: m.expectClassParameters(map[string]interface{}{className: 1}).
+				expectClaimParameters(map[string]interface{}{claimName: 2}).
+				expectUnsuitableNodes(map[string][]string{podClaimName: append([]string{nodeName}, maxNodes...)}, nil),
+			expectedSchedulingCtx: withSpecificUnsuitableNodes(withSelectedNode(withSpecificPotentialNodes(podSchedulingCtx, maxNodes)), append([]string{nodeName}, maxNodes[:len(maxNodes)-1]...)),
+			expectedError:         errPeriodic.Error(),
+		},
 	} {
 		t.Run(name, func(t *testing.T) {
 			_, ctx := ktesting.NewTestContext(t)