Ubernetes Lite: Volumes can dictate zone scheduling

An AWS EBS volume can only be attached to a node in the same AZ as the
volume. The scheduler must therefore detect when a pod uses such a
volume, and ensure that the pod is scheduled onto a node in the same AZ
as the volume.

So that the scheduler need not query the cloud provider on every
scheduling decision, and to support decoupled operation (e.g. bare
metal), we tag the volume with our placement labels. On AWS this is
done automatically by an admission controller whenever a
PersistentVolume backed by an EBS volume is created.
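
As a hedged sketch of the result (not part of this commit; the name,
volume ID, zone and region values are illustrative only), the admission
controller leaves the PersistentVolume carrying the
unversioned.LabelZone* keys that the new scheduler predicate matches on:

package main

import (
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/unversioned"
)

// examplePV approximates a PersistentVolume after the AWS admission
// controller has labeled it. All concrete values are illustrative.
var examplePV = &api.PersistentVolume{
	ObjectMeta: api.ObjectMeta{
		Name: "pv-ebs-example",
		Labels: map[string]string{
			unversioned.LabelZoneRegion:        "us-east-1",  // region of the backing EBS volume
			unversioned.LabelZoneFailureDomain: "us-east-1a", // AZ the volume lives in
		},
	},
	Spec: api.PersistentVolumeSpec{
		PersistentVolumeSource: api.PersistentVolumeSource{
			AWSElasticBlockStore: &api.AWSElasticBlockStoreVolumeSource{
				VolumeID: "vol-0123456789abcdef0", // illustrative EBS volume ID
				FSType:   "ext4",
			},
		},
	},
}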

Support for tagging GCE PVs will follow.

Pods that specify a volume directly (i.e. without using a
PersistentVolumeClaim) are not currently scheduled correctly: they are
scheduled without zone awareness.
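
To make that distinction concrete, here is a hedged sketch (names and
IDs are illustrative, not from this commit) of the two ways a pod can
reference the same EBS-backed storage; only the claim-backed form is
zone-aware under this change:

package main

import "k8s.io/kubernetes/pkg/api"

// Zone-aware: the pod references a PersistentVolumeClaim, so the new
// predicate can follow claim -> bound PV -> zone labels.
var claimBacked = api.Volume{
	Name: "data",
	VolumeSource: api.VolumeSource{
		PersistentVolumeClaim: &api.PersistentVolumeClaimVolumeSource{
			ClaimName: "data-claim", // illustrative claim name
		},
	},
}

// Not zone-aware (yet): the pod declares the EBS volume inline, so there
// is no labeled PV to inspect and the predicate passes it through unchecked.
var inline = api.Volume{
	Name: "data",
	VolumeSource: api.VolumeSource{
		AWSElasticBlockStore: &api.AWSElasticBlockStoreVolumeSource{
			VolumeID: "vol-0123456789abcdef0", // illustrative volume ID
		},
	},
}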
Justin Santa Barbara
2015-11-29 14:00:49 -05:00
parent 7743a4ca89
commit f9a6ac077e
15 changed files with 536 additions and 4 deletions

@@ -26,12 +26,21 @@ import (
	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
	"github.com/golang/glog"
	"k8s.io/kubernetes/pkg/api/unversioned"
)

type NodeInfo interface {
	GetNodeInfo(nodeID string) (*api.Node, error)
}

type PersistentVolumeInfo interface {
	GetPersistentVolumeInfo(pvID string) (*api.PersistentVolume, error)
}

type PersistentVolumeClaimInfo interface {
	GetPersistentVolumeClaimInfo(namespace string, pvcID string) (*api.PersistentVolumeClaim, error)
}

type StaticNodeInfo struct {
	*api.NodeList
}
@@ -136,6 +145,108 @@ func NoDiskConflict(pod *api.Pod, existingPods []*api.Pod, node string) (bool, e
	return true, nil
}

type VolumeZoneChecker struct {
	nodeInfo NodeInfo
	pvInfo   PersistentVolumeInfo
	pvcInfo  PersistentVolumeClaimInfo
}

// VolumeZonePredicate evaluates if a pod can fit due to the volumes it requests, given
// that some volumes may have zone scheduling constraints. The requirement is that any
// volume zone-labels must match the equivalent zone-labels on the node. It is OK for
// the node to have more zone-label constraints (for example, a hypothetical replicated
// volume might allow region-wide access).
//
// Currently this is only supported with PersistentVolumeClaims, and looks to the labels
// only on the bound PersistentVolume.
//
// Working with volumes declared inline in the pod specification (i.e. not
// using a PersistentVolume) is likely to be harder, as it would require
// determining the zone of a volume during scheduling, and that is likely to
// require calling out to the cloud provider. It seems that we are moving away
// from inline volume declarations anyway.
func NewVolumeZonePredicate(nodeInfo NodeInfo, pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo) algorithm.FitPredicate {
	c := &VolumeZoneChecker{
		nodeInfo: nodeInfo,
		pvInfo:   pvInfo,
		pvcInfo:  pvcInfo,
	}
	return c.predicate
}

func (c *VolumeZoneChecker) predicate(pod *api.Pod, existingPods []*api.Pod, nodeID string) (bool, error) {
	node, err := c.nodeInfo.GetNodeInfo(nodeID)
	if err != nil {
		return false, err
	}
	if node == nil {
		return false, fmt.Errorf("node not found: %q", nodeID)
	}

	nodeConstraints := make(map[string]string)
	for k, v := range node.ObjectMeta.Labels {
		if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
			continue
		}
		nodeConstraints[k] = v
	}

	if len(nodeConstraints) == 0 {
		// The node has no zone constraints, so we're OK to schedule.
		// In practice, when using zones, all nodes must be labeled with zone labels.
		// We want to fast-path this case though.
		return true, nil
	}

	namespace := pod.Namespace
	manifest := &(pod.Spec)
	for i := range manifest.Volumes {
		volume := &manifest.Volumes[i]
		if volume.PersistentVolumeClaim != nil {
			pvcName := volume.PersistentVolumeClaim.ClaimName
			if pvcName == "" {
				return false, fmt.Errorf("PersistentVolumeClaim had no name: %q", pvcName)
			}
			pvc, err := c.pvcInfo.GetPersistentVolumeClaimInfo(namespace, pvcName)
			if err != nil {
				return false, err
			}
			if pvc == nil {
				return false, fmt.Errorf("PersistentVolumeClaim was not found: %q", pvcName)
			}
			pvName := pvc.Spec.VolumeName
			if pvName == "" {
				return false, fmt.Errorf("PersistentVolumeClaim is not bound: %q", pvcName)
			}
			pv, err := c.pvInfo.GetPersistentVolumeInfo(pvName)
			if err != nil {
				return false, err
			}
			if pv == nil {
				return false, fmt.Errorf("PersistentVolume not found: %q", pvName)
			}
			for k, v := range pv.ObjectMeta.Labels {
				if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
					continue
				}
				nodeV := nodeConstraints[k]
				if v != nodeV {
					glog.V(2).Infof("Won't schedule pod %q onto node %q due to volume %q (mismatch on %q)", pod.Name, nodeID, pvName, k)
					return false, nil
				}
			}
		}
	}
	return true, nil
}

type ResourceFit struct {
	info NodeInfo
}
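
For intuition, the following hypothetical harness (not part of the
commit; the fakes, names, and zone values are all made up) exercises the
predicate with in-memory stand-ins for the scheduler's node/PV/PVC
caches:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/unversioned"
	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
)

// In-memory fakes satisfying the NodeInfo, PersistentVolumeInfo and
// PersistentVolumeClaimInfo interfaces introduced above.
type fakeNodes map[string]*api.Node

func (f fakeNodes) GetNodeInfo(nodeID string) (*api.Node, error) { return f[nodeID], nil }

type fakePVs map[string]*api.PersistentVolume

func (f fakePVs) GetPersistentVolumeInfo(pvID string) (*api.PersistentVolume, error) {
	return f[pvID], nil
}

type fakePVCs map[string]*api.PersistentVolumeClaim

func (f fakePVCs) GetPersistentVolumeClaimInfo(namespace, pvcID string) (*api.PersistentVolumeClaim, error) {
	return f[pvcID], nil
}

func main() {
	zoneA := map[string]string{unversioned.LabelZoneFailureDomain: "us-east-1a"}
	zoneB := map[string]string{unversioned.LabelZoneFailureDomain: "us-east-1b"}

	pred := predicates.NewVolumeZonePredicate(
		fakeNodes{
			"node-a": {ObjectMeta: api.ObjectMeta{Labels: zoneA}},
			"node-b": {ObjectMeta: api.ObjectMeta{Labels: zoneB}},
		},
		fakePVs{"pv-a": {ObjectMeta: api.ObjectMeta{Labels: zoneA}}},
		fakePVCs{"claim-a": {Spec: api.PersistentVolumeClaimSpec{VolumeName: "pv-a"}}},
	)

	// A pod whose only volume is claim-a, which is bound to pv-a in zone a.
	pod := &api.Pod{Spec: api.PodSpec{Volumes: []api.Volume{{
		Name: "data",
		VolumeSource: api.VolumeSource{
			PersistentVolumeClaim: &api.PersistentVolumeClaimVolumeSource{ClaimName: "claim-a"},
		},
	}}}}

	fits, _ := pred(pod, nil, "node-a")
	fmt.Println(fits) // true: node zone matches the volume's zone label
	fits, _ = pred(pod, nil, "node-b")
	fmt.Println(fits) // false: zone label mismatch
}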

@@ -64,6 +64,13 @@ func defaultPredicates() sets.String {
		),
		// Fit is determined by non-conflicting disk volumes.
		factory.RegisterFitPredicate("NoDiskConflict", predicates.NoDiskConflict),
		// Fit is determined by volume zone requirements.
		factory.RegisterFitPredicateFactory(
			"NoVolumeZoneConflict",
			func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
				return predicates.NewVolumeZonePredicate(args.NodeInfo, args.PVInfo, args.PVCInfo)
			},
		),
		// Fit is determined by node selector query.
		factory.RegisterFitPredicateFactory(
			"MatchNodeSelector",

@@ -58,6 +58,10 @@ type ConfigFactory struct {
	PodLister algorithm.PodLister
	// a means to list all nodes
	NodeLister *cache.StoreToNodeLister
	// a means to list all PersistentVolumes
	PVLister *cache.StoreToPVFetcher
	// a means to list all PersistentVolumeClaims
	PVCLister *cache.StoreToPVCFetcher
	// a means to list all services
	ServiceLister *cache.StoreToServiceLister
	// a means to list all controllers
@@ -85,6 +89,8 @@ func NewConfigFactory(client *client.Client, rateLimiter util.RateLimiter, sched
		ScheduledPodLister: &cache.StoreToPodLister{},
		// Only nodes in the "Ready" condition with status == "True" are schedulable
		NodeLister:       &cache.StoreToNodeLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
		PVLister:         &cache.StoreToPVFetcher{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
		PVCLister:        &cache.StoreToPVCFetcher{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
		ServiceLister:    &cache.StoreToServiceLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
		ControllerLister: &cache.StoreToReplicationControllerLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
		StopEverything:   make(chan struct{}),
@@ -188,6 +194,8 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
		// All fit predicates only need to consider schedulable nodes.
		NodeLister: f.NodeLister.NodeCondition(getNodeConditionPredicate()),
		NodeInfo:   &predicates.CachedNodeInfo{f.NodeLister},
		PVInfo:     f.PVLister,
		PVCInfo:    f.PVCLister,
	}
	predicateFuncs, err := getFitPredicateFunctions(predicateKeys, pluginArgs)
	if err != nil {
@@ -209,6 +217,11 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
	// Nodes may be listed frequently, so provide a local up-to-date cache.
	cache.NewReflector(f.createNodeLW(), &api.Node{}, f.NodeLister.Store, 0).RunUntil(f.StopEverything)

	// Watch PVs & PVCs
	// They may be listed frequently for scheduling constraints, so provide a local up-to-date cache.
	cache.NewReflector(f.createPersistentVolumeLW(), &api.PersistentVolume{}, f.PVLister.Store, 0).RunUntil(f.StopEverything)
	cache.NewReflector(f.createPersistentVolumeClaimLW(), &api.PersistentVolumeClaim{}, f.PVCLister.Store, 0).RunUntil(f.StopEverything)

	// Watch and cache all service objects. Scheduler needs to find all pods
	// created by the same services or ReplicationControllers, so that it can spread them correctly.
	// Cache this locally.
@@ -303,6 +316,16 @@ func (factory *ConfigFactory) createNodeLW() *cache.ListWatch {
	return cache.NewListWatchFromClient(factory.Client, "nodes", api.NamespaceAll, fields)
}

// createPersistentVolumeLW returns a cache.ListWatch that gets all changes to persistentVolumes.
func (factory *ConfigFactory) createPersistentVolumeLW() *cache.ListWatch {
	return cache.NewListWatchFromClient(factory.Client, "persistentVolumes", api.NamespaceAll, fields.ParseSelectorOrDie(""))
}

// createPersistentVolumeClaimLW returns a cache.ListWatch that gets all changes to persistentVolumeClaims.
func (factory *ConfigFactory) createPersistentVolumeClaimLW() *cache.ListWatch {
	return cache.NewListWatchFromClient(factory.Client, "persistentVolumeClaims", api.NamespaceAll, fields.ParseSelectorOrDie(""))
}

// Returns a cache.ListWatch that gets all changes to services.
func (factory *ConfigFactory) createServiceLW() *cache.ListWatch {
	return cache.NewListWatchFromClient(factory.Client, "services", api.NamespaceAll, fields.ParseSelectorOrDie(""))

@@ -38,6 +38,8 @@ type PluginFactoryArgs struct {
	algorithm.ControllerLister
	NodeLister algorithm.NodeLister
	NodeInfo   predicates.NodeInfo
	PVInfo     predicates.PersistentVolumeInfo
	PVCInfo    predicates.PersistentVolumeClaimInfo
}

// A FitPredicateFactory produces a FitPredicate from the given args.