Ubernetes Lite: Volumes can dictate zone scheduling
For AWS EBS, a volume can only be attached to a node in the same AZ. The scheduler must therefore detect when a pod uses such a volume and ensure that the pod is scheduled onto a node in the same AZ as the volume. So that the scheduler need not query the cloud provider every time, and to support decoupled operation (e.g. bare metal), we tag the volume with our placement labels. On AWS this is done automatically by an admission controller whenever a PersistentVolume backed by an EBS volume is created. Support for tagging GCE PVs will follow. Pods that specify a volume directly (i.e. without using a PersistentVolumeClaim) will not currently be scheduled correctly, i.e. they will be scheduled without zone awareness.
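As a rough, self-contained sketch of the matching rule (not part of this commit; the label keys shown are assumed to be the values behind unversioned.LabelZoneFailureDomain and unversioned.LabelZoneRegion, and the zone names are illustrative), every zone/region label carried by the volume must match the corresponding label on the node:

package main

import "fmt"

// zoneLabelsMatch reports whether every zone/region label carried by the volume
// is present with the same value on the node. Extra node labels are allowed.
func zoneLabelsMatch(volumeLabels, nodeLabels map[string]string) bool {
    zoneKeys := []string{
        "failure-domain.beta.kubernetes.io/zone",   // assumed value of LabelZoneFailureDomain
        "failure-domain.beta.kubernetes.io/region", // assumed value of LabelZoneRegion
    }
    for _, k := range zoneKeys {
        v, ok := volumeLabels[k]
        if !ok {
            continue // the volume places no constraint on this dimension
        }
        if nodeLabels[k] != v {
            return false
        }
    }
    return true
}

func main() {
    pv := map[string]string{"failure-domain.beta.kubernetes.io/zone": "us-east-1a"}
    node := map[string]string{"failure-domain.beta.kubernetes.io/zone": "us-east-1b"}
    fmt.Println(zoneLabelsMatch(pv, node)) // false: the EBS volume lives in a different AZ
}
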
@@ -26,12 +26,21 @@ import (
    "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"

    "github.com/golang/glog"
    "k8s.io/kubernetes/pkg/api/unversioned"
)

type NodeInfo interface {
    GetNodeInfo(nodeID string) (*api.Node, error)
}

type PersistentVolumeInfo interface {
    GetPersistentVolumeInfo(pvID string) (*api.PersistentVolume, error)
}

type PersistentVolumeClaimInfo interface {
    GetPersistentVolumeClaimInfo(namespace string, pvcID string) (*api.PersistentVolumeClaim, error)
}

type StaticNodeInfo struct {
    *api.NodeList
}

@@ -136,6 +145,108 @@ func NoDiskConflict(pod *api.Pod, existingPods []*api.Pod, node string) (bool, e
    return true, nil
}

type VolumeZoneChecker struct {
    nodeInfo NodeInfo
    pvInfo   PersistentVolumeInfo
    pvcInfo  PersistentVolumeClaimInfo
}

// VolumeZonePredicate evaluates if a pod can fit due to the volumes it requests, given
// that some volumes may have zone scheduling constraints. The requirement is that any
// volume zone-labels must match the equivalent zone-labels on the node. It is OK for
// the node to have more zone-label constraints (for example, a hypothetical replicated
// volume might allow region-wide access)
//
// Currently this is only supported with PersistentVolumeClaims, and looks to the labels
// only on the bound PersistentVolume.
//
// Working with volumes declared inline in the pod specification (i.e. not
// using a PersistentVolume) is likely to be harder, as it would require
// determining the zone of a volume during scheduling, and that is likely to
// require calling out to the cloud provider. It seems that we are moving away
// from inline volume declarations anyway.
func NewVolumeZonePredicate(nodeInfo NodeInfo, pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo) algorithm.FitPredicate {
    c := &VolumeZoneChecker{
        nodeInfo: nodeInfo,
        pvInfo:   pvInfo,
        pvcInfo:  pvcInfo,
    }
    return c.predicate
}

func (c *VolumeZoneChecker) predicate(pod *api.Pod, existingPods []*api.Pod, nodeID string) (bool, error) {
    node, err := c.nodeInfo.GetNodeInfo(nodeID)
    if err != nil {
        return false, err
    }
    if node == nil {
        return false, fmt.Errorf("node not found: %q", nodeID)
    }

    nodeConstraints := make(map[string]string)
    for k, v := range node.ObjectMeta.Labels {
        if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
            continue
        }
        nodeConstraints[k] = v
    }

    if len(nodeConstraints) == 0 {
        // The node has no zone constraints, so we're OK to schedule.
        // In practice, when using zones, all nodes must be labeled with zone labels.
        // We want to fast-path this case though.
        return true, nil
    }

    namespace := pod.Namespace

    manifest := &(pod.Spec)
    for i := range manifest.Volumes {
        volume := &manifest.Volumes[i]
        if volume.PersistentVolumeClaim != nil {
            pvcName := volume.PersistentVolumeClaim.ClaimName
            if pvcName == "" {
                return false, fmt.Errorf("PersistentVolumeClaim had no name: %q", pvcName)
            }
            pvc, err := c.pvcInfo.GetPersistentVolumeClaimInfo(namespace, pvcName)
            if err != nil {
                return false, err
            }

            if pvc == nil {
                return false, fmt.Errorf("PersistentVolumeClaim was not found: %q", pvcName)
            }

            pvName := pvc.Spec.VolumeName
            if pvName == "" {
                return false, fmt.Errorf("PersistentVolumeClaim is not bound: %q", pvcName)
            }

            pv, err := c.pvInfo.GetPersistentVolumeInfo(pvName)
            if err != nil {
                return false, err
            }

            if pv == nil {
                return false, fmt.Errorf("PersistentVolume not found: %q", pvName)
            }

            for k, v := range pv.ObjectMeta.Labels {
                if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
                    continue
                }
                nodeV, _ := nodeConstraints[k]
                if v != nodeV {
                    glog.V(2).Infof("Won't schedule pod %q onto node %q due to volume %q (mismatch on %q)", pod.Name, nodeID, pvName, k)
                    return false, nil
                }
            }
        }
    }

    return true, nil
}

type ResourceFit struct {
    info NodeInfo
}

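The three lookup interfaces above (NodeInfo, PersistentVolumeInfo, PersistentVolumeClaimInfo) keep the predicate free of any client or cloud-provider dependency, so it can be exercised in isolation. A minimal sketch of hypothetical test scaffolding (not part of this commit; it assumes the same package and api imports as the predicate above) might look like:

// Hypothetical stubs satisfying the lookup interfaces used by VolumeZoneChecker.
type fakeNodeInfo struct{ node *api.Node }

func (f fakeNodeInfo) GetNodeInfo(nodeID string) (*api.Node, error) { return f.node, nil }

type fakePVInfo struct{ pv *api.PersistentVolume }

func (f fakePVInfo) GetPersistentVolumeInfo(pvID string) (*api.PersistentVolume, error) {
    return f.pv, nil
}

type fakePVCInfo struct{ pvc *api.PersistentVolumeClaim }

func (f fakePVCInfo) GetPersistentVolumeClaimInfo(namespace string, pvcID string) (*api.PersistentVolumeClaim, error) {
    return f.pvc, nil
}

// Wiring the predicate against the stubs; the existingPods argument is unused by this check:
//   fit := NewVolumeZonePredicate(fakeNodeInfo{node}, fakePVInfo{pv}, fakePVCInfo{pvc})
//   ok, err := fit(pod, nil, node.Name)
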
@@ -64,6 +64,13 @@ func defaultPredicates() sets.String {
        ),
        // Fit is determined by non-conflicting disk volumes.
        factory.RegisterFitPredicate("NoDiskConflict", predicates.NoDiskConflict),
        // Fit is determined by volume zone requirements.
        factory.RegisterFitPredicateFactory(
            "NoVolumeZoneConflict",
            func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
                return predicates.NewVolumeZonePredicate(args.NodeInfo, args.PVInfo, args.PVCInfo)
            },
        ),
        // Fit is determined by node selector query.
        factory.RegisterFitPredicateFactory(
            "MatchNodeSelector",

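Note the design choice here: NoDiskConflict is a plain function and can be registered directly with RegisterFitPredicate, whereas NoVolumeZoneConflict goes through RegisterFitPredicateFactory because it needs the NodeInfo, PVInfo and PVCInfo lookups, which only become available at configuration time via PluginFactoryArgs (wired up in the factory changes below).
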
@@ -58,6 +58,10 @@ type ConfigFactory struct {
    PodLister algorithm.PodLister
    // a means to list all nodes
    NodeLister *cache.StoreToNodeLister
    // a means to list all PersistentVolumes
    PVLister *cache.StoreToPVFetcher
    // a means to list all PersistentVolumeClaims
    PVCLister *cache.StoreToPVCFetcher
    // a means to list all services
    ServiceLister *cache.StoreToServiceLister
    // a means to list all controllers

@@ -85,6 +89,8 @@ func NewConfigFactory(client *client.Client, rateLimiter util.RateLimiter, sched
        ScheduledPodLister: &cache.StoreToPodLister{},
        // Only nodes in the "Ready" condition with status == "True" are schedulable
        NodeLister:       &cache.StoreToNodeLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
        PVLister:         &cache.StoreToPVFetcher{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
        PVCLister:        &cache.StoreToPVCFetcher{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
        ServiceLister:    &cache.StoreToServiceLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
        ControllerLister: &cache.StoreToReplicationControllerLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
        StopEverything:   make(chan struct{}),

@@ -188,6 +194,8 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
        // All fit predicates only need to consider schedulable nodes.
        NodeLister: f.NodeLister.NodeCondition(getNodeConditionPredicate()),
        NodeInfo:   &predicates.CachedNodeInfo{f.NodeLister},
        PVInfo:     f.PVLister,
        PVCInfo:    f.PVCLister,
    }
    predicateFuncs, err := getFitPredicateFunctions(predicateKeys, pluginArgs)
    if err != nil {

@@ -209,6 +217,11 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
    // Nodes may be listed frequently, so provide a local up-to-date cache.
    cache.NewReflector(f.createNodeLW(), &api.Node{}, f.NodeLister.Store, 0).RunUntil(f.StopEverything)

    // Watch PVs & PVCs
    // They may be listed frequently for scheduling constraints, so provide a local up-to-date cache.
    cache.NewReflector(f.createPersistentVolumeLW(), &api.PersistentVolume{}, f.PVLister.Store, 0).RunUntil(f.StopEverything)
    cache.NewReflector(f.createPersistentVolumeClaimLW(), &api.PersistentVolumeClaim{}, f.PVCLister.Store, 0).RunUntil(f.StopEverything)

    // Watch and cache all service objects. Scheduler needs to find all pods
    // created by the same services or ReplicationControllers, so that it can spread them correctly.
    // Cache this locally.

@@ -303,6 +316,16 @@ func (factory *ConfigFactory) createNodeLW() *cache.ListWatch {
    return cache.NewListWatchFromClient(factory.Client, "nodes", api.NamespaceAll, fields)
}

// createPersistentVolumeLW returns a cache.ListWatch that gets all changes to persistentVolumes.
func (factory *ConfigFactory) createPersistentVolumeLW() *cache.ListWatch {
    return cache.NewListWatchFromClient(factory.Client, "persistentVolumes", api.NamespaceAll, fields.ParseSelectorOrDie(""))
}

// createPersistentVolumeClaimLW returns a cache.ListWatch that gets all changes to persistentVolumeClaims.
func (factory *ConfigFactory) createPersistentVolumeClaimLW() *cache.ListWatch {
    return cache.NewListWatchFromClient(factory.Client, "persistentVolumeClaims", api.NamespaceAll, fields.ParseSelectorOrDie(""))
}

// Returns a cache.ListWatch that gets all changes to services.
func (factory *ConfigFactory) createServiceLW() *cache.ListWatch {
    return cache.NewListWatchFromClient(factory.Client, "services", api.NamespaceAll, fields.ParseSelectorOrDie(""))

@@ -38,6 +38,8 @@ type PluginFactoryArgs struct {
    algorithm.ControllerLister
    NodeLister algorithm.NodeLister
    NodeInfo   predicates.NodeInfo
    PVInfo     predicates.PersistentVolumeInfo
    PVCInfo    predicates.PersistentVolumeClaimInfo
}

// A FitPredicateFactory produces a FitPredicate from the given args.